Changeset 2012


Ignore:
Timestamp:
2001-02-19T18:06:16+13:00 (23 years ago)
Author:
jrm21
Message:

re-added the crappy PS text-stripper, and made the error handling
a bit more robust.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r1997 r2012  
    276276# Find the real type of a .doc file
    277277#
    278 # We seem to have alot of files with a .dco extension that are .rtf
     278# We seem to have a lot of files with a .doc extension that are .rtf
    279279# files or Word 5 files.  This function attempts to tell the difference.
    280280
     
    487487}
    488488
    489 # Convert a PostScript document to text with ps2ascii
     489# Convert a PostScript document to text
     490# Note: just using "ps2ascii" isn't good enough, as it
     491# returns 0 even for a PostScript interpreter error. ps2ascii is just
     492# a wrapper around "gs" anyway, so we call gs directly here.
    490493
    491494sub ps_to_text {
    492495    ($input_filename, $output_filestem) = @_;
    493496
    494     my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
     497    my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
     498    $cmd .= "-f ps2ascii \"$input_filename\" -c quit > \"$output_filestem.text\"";
    495499    $cmd .= " 2> $output_filestem.err";
    496    
    497     if (system($cmd)>0)
    498     {
    499     print STDERR "Error executing $cmd: $!\n";
     500    $!=0;
     501    my $retcode=system($cmd);
     502    $retcode = $? >> 8;  # see man perlfunc - system for this...
     503    # if system returns -1 or 127 (couldn't start program), look at $! for the message
     504    my $error="";
     505    if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
     506    elsif (! -e "$output_filestem.text") {
     507    $error="did not create output file.\n";
     508    }
     509    else
     510    { # make sure the interpreter didn't get an error. It is technically
     511    # possible for the actual text to start with this, but....
     512    open PSOUT, "$output_filestem.text";
     513    if (<PSOUT> =~ /^Error: (.*)/) {
     514        $error="interpreter error - \"$1\"";
     515    }
     516    close PSOUT;
     517    }
     518#    print STDERR "retcode=$retcode, error=\"$!\"\n";
     519    if ($error ne "")
     520    {
     521    print STDERR "PSPLUG: WARNING: Error executing gs: $error\n";
    500522    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    501523    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    502     return 0;
     524
     525    # Fine then. We'll just do a lousy job by ourselves...
     526    # Based on regexps nicked from:
     527    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
     528    #
     529    print STDERR "PSPlug: Stripping text from postscript\n";
     530    my $errorcode=0;
     531    open (IN, "$input_filename")
     532        ||  ($errorcode=1, warn "Couldn't read file: $!");
     533    open (OUT, ">$output_filestem.text")
     534        ||  ($errorcode=1, warn "Couldn't write file: $!");
     535    if ($errorcode) {print STDERR "errors\n";return 0;}
     536   
     537    my $in_a_sentence=0;
     538    while (<IN>) {
     539        if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
     540        # attempt to add whitespace between different lines...
     541        s/F.?\(/\( /g; # this might break up some other words though...
     542        ### remove all postscript control data
     543        if (!$in_a_sentence) {
     544        s/^[^\(\)]*?\(//;  # rm start of line up to first open bracket
     545        }
     546        s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
     547        s/\)([^\(\)])*?\(//g ;   # close bracket up to next open unquoted bracket
     548        if (s/\)[^\(\)]*?$//g)  # last close bracket to end of line
     549        {$in_a_sentence=0;chomp;}     
     550        if (s/\\$//) # if line is a continuation
     551        {$in_a_sentence=1;chomp;}
     552        s/^$//g ;             # remove empty lines
     553        ### ligatures have special characters...
     554        s/\\214/fi/g;
     555        s/\\215/fl/g;
     556        print OUT "$_";
     557    }
     558    close IN; close OUT;
    503559    }
    504560
Note: See TracChangeset for help on using the changeset viewer.