Changeset 1734


Ignore:
Timestamp:
2000-12-01T16:36:33+13:00 (23 years ago)
Author:
jrm21
Message:

For PostScript files, fall back to a simple built-in text extraction if
ps2ascii isn't found. (This should be portable, as it is written in Perl.)
The extracted text won't be formatted, though, so for now it is only useful
for indexing - users will still have to view the original PostScript file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r1705 r1734  
    8484    # Deduce filenames
    8585    my ($tailname,$dirname,$suffix)
    86     = File::Basename::fileparse($input_filename,'\.[^\.]+');
     86    = File::Basename::fileparse($input_filename,'\..+');
    8787    my $output_filestem = &util::filename_cat($dirname,"$tailname");
    8888
     
    102102    }
    103103    elsif ($input_type eq "doc") {
     104    print STDERR "I recognise this to be a Word document...\n"; # remove
    104105    print &convertDOC($input_filename, $output_filestem, $output_type);
    105106    print "\n";
     
    148149    my $realtype = &find_docfile_type($input_filename);
    149150
    150     if ($realtype eq "word678") {
     151    print STDERR "The real type of this Word document is $realtype\n"; # remove
     152
     153    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
     154    print STDERR "I recognise this to be a word678 document...\n"; # remove
    151155    return &convertWord678($input_filename, $output_filestem, $output_type);
    152156    } elsif ($realtype eq "rtf") {
     
    166170    # Attempt specialised conversion to HTML
    167171    if (!$output_type || ($output_type =~ /html/i)) {
     172    print STDERR "I am about to call doc_to_html...\n";
    168173    $success = &doc_to_html($input_filename, $output_filestem);
    169174    if ($success) {
     
    278283    ($input_filename) = @_;
    279284   
     285    open(TMP, ">temp.txt");
     286    binmode(TMP);
    280287    open(CHK, "<$input_filename");
     288    binmode(CHK);
    281289    my $line = "";
    282290    my $first = 1;
     
    285293   
    286294    $line = $_;
    287 
     295    print TMP "$line\n\n";
    288296    if ($first) {
    289297        # check to see if this is an rtf file
     
    294302    }
    295303   
    296     # is theis a word 6/7/8 document?
    297     if ($line =~ /Word\.Document\.[678]/) {
     304    # is this is a word 6/7/8 document?
     305    if ($line =~ /Word\.Document\.([678])/) {
    298306        close(CHK);
    299         return "word678";
     307        return "word$1";
    300308    }
    301309
     
    309317
    310318
    311 # Specific type-to-type cponversions
     319# Specific type-to-type conversions
    312320#
    313321# Each of the following functions attempts to convert a document from
     
    320328
    321329sub doc_to_html {
     330    print STDERR "/;-DG I am in doc_to_html...\n"; # remove
    322331    ($input_filename, $output_filestem) = @_;
    323332
    324     # formulate the command
    325     my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
    326     my $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
    327     my $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
     333    my $wvWare = "";
     334    my $wv_conf = "";
     335    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     336    $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
     337    $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";
     338
     339    } else {
     340    # formulate the command
     341    my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
     342    $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
     343    $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
     344    }
     345    print STDERR "I am about to test if your file exists...\n";
    328346    return 0 unless (-e "$wvWare");
    329347    $cmd = "";
     
    332350    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
    333351
     352    print STDERR "$cmd\n"; #remove
     353
    334354    # execute the command
     355    print STDERR system($cmd);
     356    print STDERR "\n";
    335357    if (system($cmd)>0)
    336358    {
     
    352374    }
    353375    }
     376    print STDERR "/;-DG I am leaving doc_to_html...\n";
    354377    return 0;
    355378}
     
    488511    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    489512    $cmd .= " 2> $output_filestem.err";
    490    
    491513    if (system($cmd)>0)
    492514    {
     
    494516    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    495517    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    496     return 0;
    497     }
    498 
     518
     519    # Fine then. We'll just do a lousy job by ourselves...
     520    # Based on code nicked from:
     521    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
     522    #
     523    print STDERR "Attempting to strip text from postscript.\n";
     524    my $errorcode=0;
     525    open (IN, "$input_filename")
     526        ||  ($errorcode=1, warn "Couldn't read file: $!");
     527    open (OUT, ">$output_filestem.text")
     528        ||  ($errorcode=1, warn "Couldn't write file: $!");
     529    if ($errorcode) {print STDERR "errors\n";return 0;}
     530
     531    my $in_a_sentence=0;
     532    while (<IN>) {
     533        if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
     534        # attempt to add whitespace between different lines...
     535        s/F.?\(/\( /g; # this might break up some other words though...
     536        ### remove all postscript control data
     537        if (!$in_a_sentence) {
     538        s/^[^\(\)]*?\(//;}  # rm start of line up to first open bracket
     539        s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
     540        s/\)([^\(\)])*?\(//g ;   # close bracket up to next open unquoted bracket
     541        if (s/\)[^\(\)]*?$//g)  # last close bracket to end of line
     542        {$in_a_sentence=0;chomp;}     
     543        if (s/\\$//) # if line is a continuation
     544        {$in_a_sentence=1;chomp;}
     545        s/^$//g ;             # remove empty lines
     546        ### ligatures have special characters...
     547        s/\\214/fi/g;
     548        s/\\215/fl/g;
     549        print OUT "$_";
     550    }
     551    close IN; close OUT;
     552    }
    499553    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    500554    return 1;
     
    506560
    507561sub any_to_html {
     562    print STDERR "/;-Dg I am in any_to_html!\n";
    508563    ($input_filename, $output_filestem) = @_;
    509564
     
    518573<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
    519574<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
    520 </head><body>\n\n';
     575</head><body>';
     576    print HTML "\n\n";
     577
    521578    while (<TEXT>) {
    522579    print HTML "<p> ", $_;
    523580   
    524581    }
    525     print HTML "\n</body></html>]\n";
     582    print HTML "\n</body></html>\n";
    526583
    527584    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
     585    print STDERR "/;-Dg I am getting out of  any_to_html!\n";
    528586    return 1;
    529587}
     
    535593    ($input_filename, $output_filestem) = @_;
    536594
     595    #open(TEMP, ">temp.txt");
    537596    open(IN, "<$input_filename");
     597    binmode(IN);
    538598    open(OUT, ">$output_filestem.text");
    539599
    540600    my ($line);
     601    my $dgcount = 0;
    541602    while (<IN>) {
    542603    $line = $_;
    543    
     604
    544605    # delete anything that isn't a printable character
     606    #print TEMP $line;
    545607    $line =~ s/[^\040-\176]+/\n/sg;
    546608
    547609    # delete any string less than 10 characters long
    548     $line =~ s/^[^\n]{0,9}$/\n/mg;
    549     while ($line =~ /^[^\n]{1,9}$/m) {
    550         $line =~ s/^[^\n]{0,9}$/\n/mg;
     610    $line =~ s/^.{0,9}$/\n/mg;
     611    while ($line =~ /^.{1,9}$/m) {
     612        $line =~ s/^.{0,9}$/\n/mg;
    551613        $line =~ s/\n+/\n/sg;
    552614    }
Note: See TracChangeset for help on using the changeset viewer.