Changeset 1654


Ignore:
Timestamp:
2000-11-03T15:22:00+13:00 (23 years ago)
Author:
paynter
Message:

Check .doc files to see if they are RTF files, Word 6/7/8 files that wv
handles, or "unknown" files (which we strip of binary characters and hope
the result is worthwhile).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r1578 r1654  
    137137    ($input_filename, $output_filestem, $output_type) = @_;
    138138
     139    # Many .doc files are not in fact word documents!
     140    my $realtype = &find_docfile_type($input_filename);
     141
     142    if ($realtype eq "word678") {
     143    return &convertWord678($input_filename, $output_filestem, $output_type);
     144    } elsif ($realtype eq "rtf") {
     145    return &convertRTF($input_filename, $output_filestem, $output_type);
     146    } else {
     147    return &convertAnything($input_filename, $output_filestem, $output_type);
     148    }
     149}
     150
     151# Convert a Microsoft word 6/7/8 document
     152
     153sub convertWord678 {
     154    ($input_filename, $output_filestem, $output_type) = @_;
     155
    139156    my $success = 0;
    140157
     
    147164    }
    148165
     166    return &convertAnything($input_filename, $output_filestem, $output_type);
     167}
     168
     169
     170# Convert a Rich Text Format (RTF) file
     171
     172sub convertRTF {
     173    ($input_filename, $output_filestem, $output_type) = @_;
     174
     175    my $success = 0;
     176
     177    # Attempt specialised conversion to HTML
     178    if (!$output_type || ($output_type =~ /html/i)) {
     179    $success = &rtf_to_html($input_filename, $output_filestem);
     180    if ($success) {
     181        return "html";
     182    }
     183    }
     184
     185    return &convertAnything($input_filename, $output_filestem, $output_type);
     186}
     187
     188
     189# Convert an unidentified file
     190
     191sub convertAnything {
     192    ($input_filename, $output_filestem, $output_type) = @_;
     193   
     194    my $success = 0;
     195   
    149196    # Attempt simple conversion to HTML
    150197    if (!$output_type || ($output_type =~ /html/i)) {
     
    162209    }
    163210    }
    164 
    165211    return "fail";
    166 
    167 }
     212}
     213
    168214
    169215
     
    213259    return "fail";
    214260
     261}
     262
     263
     264# Find the real type of a .doc file
     265#
     266# We seem to have a lot of files with a .doc extension that are .rtf
     267# files or Word 5 files.  This function attempts to tell the difference.
     268
     269sub find_docfile_type {
     270    ($input_filename) = @_;
     271   
     272    open(CHK, "<$input_filename");
     273    my $line = "";
     274    my $first = 1;
     275
     276    while (<CHK>) {
     277   
     278    $line = $_;
     279
     280    if ($first) {
     281        # check to see if this is an rtf file
     282        if ($line =~ /^\{\\rtf/) {
     283        close(CHK);
     284        return "rtf";
     285        }
     286    }
     287   
     288    # is theis a word 6/7/8 document?
     289    if ($line =~ /Word\.Document\.[678]/) {
     290        close(CHK);
     291        return "word678";
     292    }
     293
     294    $first = 0;
     295
     296    }
     297
     298    return "unknown";
    215299}
    216300
     
    235319    my $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
    236320    return 0 unless (-e "$wvWare");
    237     $cmd = "$wvWare --charset utf-8 --config $wv_conf";
     321    $cmd = "ulimit -t 20;";
     322    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    238323    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
    239    
     324
    240325    # execute the command
    241326    if (system($cmd)>0)
     
    262347
    263348
     349# Attempt to convert an RTF document to html with rtf2html
     350#
     351# rtf2html isn't distributed with Greenstone because it is not
     352# distributed under the GPL.  If you know of a better solution,
     353# please let me know.
     354
     355sub rtf_to_html {
     356    ($input_filename, $output_filestem) = @_;
     357
     358    # formulate the command
     359    my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
     360                    "rtf2html", "rtf2html", "rtf2html");
     361    $r_cmd = "rtf2html" unless (-e "$r_cmd");
     362    return 0 unless (-e "$r_cmd");
     363    $cmd = "ulimit -t 20;";
     364    $cmd .= "$r_cmd";
     365    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
     366
     367    # execute the command
     368    if (system($cmd)>0)
     369    {
     370    print STDERR "Error executing rtf converter: $!. Continuing...\n";
     371    }
     372
     373    # Was the conversion successful?
     374    if (-e "$output_filestem.html") {
     375    open(TMP, "$output_filestem.html");
     376    $line = <TMP>;
     377    close(TMP);
     378    if ($line && $line =~ /DOCTYPE HTML/) {
     379        &util::rm("$output_filestem.err");
     380        return 1;
     381    } else {
     382        # An error of some sort occurred
     383        &util::rm("$output_filestem.html");
     384        &util::rm("$output_filestem.err");
     385    }
     386    }
     387    return 0;
     388}
     389
     390
    264391# Convert a pdf file to html with the pdftohtml command
    265392
Note: See TracChangeset for help on using the changeset viewer.