Changeset 22429


Ignore:
Timestamp:
2010-07-19T13:28:14+12:00 (12 years ago)
Author:
davidb
Message:

Added support for OpenOffice scripting through JODConverter.jar. Also added 'use strict' and then fixed up a variety of places that needed a 'my' declaration.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r20933 r22429  
    4949}
    5050
     51use strict;
     52
    5153use parsargv;
    5254use util;
     
    6567my $pdf_allow_images_only;
    6668my $windows_scripting;
     69my $openoffice_scripting;
    6770
    6871sub print_usage
     
    7275    print STDERR "              or text using third-party programs.\n\n";
    7376    print STDERR "  usage: $0 [options] filename\n";
    74     print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
     77    if ($openoffice_scripting) {
     78    print STDERR "  options:\n\t-type\tdoc|dot|docx|odf|pdf|ps|ppt|rtf|xls\t(input file type)\n";
     79    }
     80    else {
     81    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
     82    }
    7583    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    7684    print STDERR "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n";
    7785    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    7886    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    79     print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n";
     87    print STDERR "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n";
     88    print STDERR "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n";
    8089    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    8190    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
     
    96105    my ($input_type,$output_type,$verbose);
    97106
     107   
     108    # scan for -openoffice_scripting as it affects the permissible
     109    # values for -type
     110
     111    foreach my $a (@ARGV) {
     112    if ($a =~ m/^-openoffice_scripting$/) {
     113        $openoffice_scripting = 1;
     114        last;
     115    }
     116    }
     117
     118    my $parse_type;
     119    if ($openoffice_scripting) {
     120    $parse_type = 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/';
     121    }
     122    else {
     123    $parse_type = 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';
     124    }
     125
    98126    # read command-line arguments
    99127    if (!parsargv::parse(\@ARGV,
    100              'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
     128             $parse_type, \$input_type,
    101129             '/errlog/.*/', \$faillogfile,
    102130             'output/(auto|html|text|pagedimage).*/', \$output_type,
    103131             'timeout/\d+/0',\$timeout,
    104132             'verbose/\d+/0', \$verbose,
     133             'windows_scripting',\$windows_scripting,
     134             'openoffice_scripting',\$openoffice_scripting,
    105135             'use_strings', \$use_strings,
    106              'windows_scripting',\$windows_scripting,
    107136             'pdf_complex', \$pdf_complex,
    108137             'pdf_ignore_images', \$pdf_ignore_images,
     
    144173    print STDERR "Error: No filename extension or input type defined\n";
    145174    exit(1);
     175    }
     176    elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) {
     177    print &convertDOC($input_filename, $output_filestem, $output_type);
     178    print "\n";
    146179    }
    147180    elsif ($input_type eq "doc" || $input_type eq "dot") {
     
    158191    }
    159192    elsif ($input_type eq "ps") {
    160     print &convertPS($input_filename, $output_filestem, $output_type);
     193    print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    161194    print "\n";
    162195    }
     
    195228
    196229sub convertDOC {
    197     ($input_filename, $output_filestem, $output_type) = @_;
     230    my ($input_filename, $output_filestem, $output_type) = @_;
     231
     232    if (($openoffice_scripting) && ($input_filename =~ m/\.docx?$/i)) {
     233    # Jump right in and process with Open Office
     234        if (openoffice_doc_to_html($input_filename, $output_filestem)) {
     235        return "html";
     236    }
     237    else {
     238        return "fail";
     239    }
     240    }
    198241
    199242    # Many .doc files are not in fact word documents!
     
    212255
    213256sub convertWord678 {
    214     ($input_filename, $output_filestem, $output_type) = @_;
     257    my ($input_filename, $output_filestem, $output_type) = @_;
    215258
    216259    my $success = 0;
     
    219262        $success = &native_doc_to_html($input_filename, $output_filestem);
    220263    }
     264    elsif ($openoffice_scripting) {
     265        $success = &openoffice_doc_to_html($input_filename, $output_filestem);
     266    }
    221267    else {
    222268        $success = &doc_to_html($input_filename, $output_filestem);   
     
    233279
    234280sub convertRTF {
    235     ($input_filename, $output_filestem, $output_type) = @_;
     281    my ($input_filename, $output_filestem, $output_type) = @_;
    236282
    237283    my $success = 0;
     
    242288    if ($windows_scripting) {
    243289        $success = &native_doc_to_html($input_filename, $output_filestem);
     290    }
     291    elsif ($openoffice_scripting) {
     292        $success = &openoffice_doc_to_html($input_filename, $output_filestem);
    244293    }
    245294    else {
     
    261310
    262311sub convertAnything {
    263     ($input_filename, $output_filestem, $output_type) = @_;
     312    my ($input_filename, $output_filestem, $output_type) = @_;
    264313   
    265314    my $success = 0;
     
    324373
    325374sub convertPS {
    326     ($input_filename, $output_filestem, $output_type) = @_;
     375    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
    327376
    328377    my $success = 0;
     
    365414    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    366415           
    367     $cmd = "";
     416    my $cmd = "";
    368417    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    369     # if the converting directory has already existed
     418    # if the converting directory already exists
    370419    if (-d $output_filestem) {
    371         print STDERR "**The conversion directory has existed\n";
     420        print STDERR "**The conversion directory already exists\n";
    372421        return "item";
    373422    } else {
     
    385434    #if (!$output_type || ($output_type =~ m/html/i)) {
    386435    # formulate the command
    387     $cmd = "";
     436    my $cmd = "";
    388437    $cmd .= "perl -S ppttohtml.pl ";
    389438    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
     
    418467    if (!$output_type || ($output_type =~ m/html/i)) {
    419468    # formulate the command
    420     $cmd = "";
     469    my $cmd = "";
    421470    $cmd .= "perl -S xlstohtml.pl ";
    422471    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
     
    450499# files or Word 5 files.  This function attempts to tell the difference.
    451500sub find_docfile_type {
    452     ($input_filename) = @_;
     501    my ($input_filename) = @_;
    453502   
    454503    open(CHK, "<$input_filename");
     
    492541# Attempt to convert a word document to html with the wv program
    493542sub doc_to_html {
    494     ($input_filename, $output_filestem) = @_;
     543    my ($input_filename, $output_filestem) = @_;
    495544
    496545    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");
     
    612661    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
    613662    open(TMP, "$output_filestem.html");
    614     $line = <TMP>;
     663    my $line = <TMP>;
    615664    close(TMP);
    616665    if ($line && $line =~ m/DOCTYPE HTML/) {
     
    750799# Attempt to convert a word document to html with the word2html scripting program
    751800sub native_doc_to_html {
    752     ($input_filename, $output_filestem) = @_;
     801    my ($input_filename, $output_filestem) = @_;
    753802
    754803    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
     
    757806    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    758807    if (-e "$output_filestem.html") {
    759     print STDERR "*** The conversion file has existed\n";
     808    print STDERR "    The conversion file:\n";
     809    print STDERR "      $output_filestem.html\n";
     810    print STDERR "    ... already exists.  Skipping\n";
    760811    return 1;
    761812    }
     
    803854    if (-s "$output_filestem.html") {
    804855    open(TMP, "$output_filestem.html");
    805     $line = <TMP>;
     856    my $line = <TMP>;
    806857    close(TMP);
    807     if ($line && $line =~ m/html/) {
     858    if ($line && $line =~ m/html/i) {
    808859        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    809860        return 1;
     
    825876}
    826877
     878# Attempt to convert a word document to html with the JODConverter scripting program
     879sub openoffice_doc_to_html {
     880    my ($input_filename, $output_filestem) = @_;
     881
     882    if (-e "$output_filestem.html") {
     883    print STDERR "    The conversion file:\n";
     884    print STDERR "      $output_filestem.html\n";
     885    print STDERR "    ... skipping\n";
     886    return 1;
     887    }
     888
     889    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");
     890    my $oo2html = &util::filename_cat($oo_script_dir,"oo2html");
     891    if (!-e $oo2html) {
     892    print STDERR "Error: Unable to find 'oo2html' in: \n";
     893    print STDERR "       $oo_script_dir\n";
     894    print STDERR "       Is the OpenOffice extension to Greenstone installed?\n";
     895    return 0;
     896    }
     897
     898    my $cmd = "";
     899    if ($timeout) {$cmd = "ulimit -t $timeout;";}
     900    $cmd .=  "$oo2html \"$input_filename\" \"$output_filestem.html\"";
     901
     902    # redirecting STDERR
     903    $cmd .= " 2> \"$output_filestem.err\""
     904    if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
     905   
     906    # execute the command
     907    $!=0;
     908    if (system($cmd)!=0)
     909    {
     910    print STDERR "Error executing oo2html converter: $!\n";
     911    print STDERR "Command was: $cmd\n";
     912
     913    if (-s "$output_filestem.err") {
     914        open (ERRFILE, "<$output_filestem.err");
     915       
     916        my $write_to_fail_log=0;
     917        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     918        {$write_to_fail_log=1;}
     919
     920        my $line;
     921        while ($line=<ERRFILE>) {
     922        if ($line =~ m/\w/) {
     923            print STDERR "$line";
     924            print FAILLOG "$line" if ($write_to_fail_log);
     925        }
     926        if ($line !~ m/startup error/) {next;}
     927        print STDERR " (given an invalid .DOC file?)\n";
     928        print FAILLOG " (given an invalid .DOC file?)\n"
     929        if ($write_to_fail_log);
     930       
     931        } # while ERRFILE
     932        close FAILLOG if ($write_to_fail_log);
     933    }
     934    return 0; # we can try any_to_text
     935    }
     936
     937    # Was the conversion successful?
     938    if (-s "$output_filestem.html") {
     939    open(TMP, "$output_filestem.html");
     940    my $line = <TMP>;
     941    close(TMP);
     942    if ($line && $line =~ m/html/i) {
     943        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
     944        return 1;
     945    }
     946    }
     947   
     948    # If here, an error of some sort occurred
     949   
     950    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
     951    if (-e "$output_filestem.err") {
     952    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
     953        open (ERRLOG,"$output_filestem.err");
     954        while (<ERRLOG>) {print FAILLOG $_;}
     955        close FAILLOG;
     956        close ERRLOG;
     957    }
     958    &util::rm("$output_filestem.err");
     959    }
     960    return 0;
     961}
     962
    827963# Attempt to convert an RTF document to html with rtftohtml
    828 
    829964sub rtf_to_html {
    830965    my ($input_filename, $output_filestem) = @_;
    831966
    832967    # formulate the command
    833     $cmd = "";
     968    my $cmd = "";
    834969    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    835970    $cmd .= "rtftohtml";
     
    9031038        my $line;
    9041039        while ($line=<TOC>) {
    905         $line =~ s@</body></html>$@@ ; # only last line has this
     1040        $line =~ s@</body></html>$@@i ; # only last line has this
    9061041        # make link relative
    907         $line =~ s@href=\"[^\#]+@href=\"@;
     1042        $line =~ s@href=\"[^\#]+@href=\"@i;
    9081043        print HTML $line;
    9091044        }
     
    9491084    my ($dirname, $input_filename, $output_filestem) = @_;
    9501085
    951     $cmd = "";
     1086    my $cmd = "";
    9521087    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    9531088    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
     
    10211156    }
    10221157
    1023     $cmd = "";
     1158    my $cmd = "";
    10241159    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    10251160    $output_type =~ s/.*\_(.*)/$1/i;
     
    13161451
    13171452sub any_to_html {
    1318     ($input_filename, $output_filestem) = @_;
     1453    my ($input_filename, $output_filestem) = @_;
    13191454
    13201455    # First generate a text file
     
    13541489
    13551490sub any_to_text {
    1356     ($input_filename, $output_filestem) = @_;
     1491    my ($input_filename, $output_filestem) = @_;
    13571492
    13581493    if (!$use_strings) {
Note: See TracChangeset for help on using the changeset viewer.