Changeset 2031


Ignore:
Timestamp:
2001-02-20T18:54:46+13:00 (23 years ago)
Author:
jrm21
Message:

Improved PostScript-to-text handling a little.
Also, the return value of system($cmd) can be -1, not just > 0, so the check is now "!= 0".

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r2023 r2031  
    418418    $cmd .= " \"$input_filename\" \"$output_filestem\"";
    419419
    420     if (system($cmd)>0)
     420    if (system($cmd)!=0)
    421421    {
    422422    print STDERR "Error executing $cmd: $!\n";
     
    489489
    490490    my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
    491     $cmd .= "-f ps2ascii \"$input_filename\" -c quit > \"$output_filestem.text\"";
     491    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
    492492    $cmd .= " 2> $output_filestem.err";
    493493    $!=0;
     
    509509    close PSOUT;
    510510    }
    511 #    print STDERR "retcode=$retcode, error=\"$!\"\n";
    512511    if ($error ne "")
    513512    {
     
    517516
    518517    # Fine then. We'll just do a lousy job by ourselves...
    519     # Based on regexps nicked from:
     518    # Based on 5-line regexp sed script found at:
    520519    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
    521520    #
     
    528527    if ($errorcode) {print STDERR "errors\n";return 0;}
    529528   
    530     my $in_a_sentence=0;
     529    my $text="";  # this is for whole .ps file...
    531530    while (<IN>) {
    532         if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
    533         # attempt to add whitespace between different lines...
    534         s/F.?\(/\( /g; # this might break up some other words though...
    535         ### remove all postscript control data
    536         if (!$in_a_sentence) {
    537         s/^[^\(\)]*?\(//;  # rm start of line up to first open bracket
    538         }
    539         s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
    540         s/\)([^\(\)])*?\(//g ;   # close bracket up to next open unquoted bracket
    541         if (s/\)[^\(\)]*?$//g)  # last close bracket to end of line
    542         {$in_a_sentence=0;chomp;}     
    543         if (s/\\$//) # if line is a continuation
    544         {$in_a_sentence=1;chomp;}
    545         s/^$//g ;             # remove empty lines
    546         ### ligatures have special characters...
    547         s/\\214/fi/g;
    548         s/\\215/fl/g;
    549         print OUT "$_";
    550     }
    551     close IN; close OUT;
    552     }
    553 
     531        $text.=$_;
     532    }
     533    close IN;
     534
     535    # if ps has Page data, then use it to delete all stuff before it.
     536    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
     537   
     538    # remove all leading non-data stuff
     539    $text =~ s/^.*?\(//s;
     540
     541    # remove all newline chars for easier processing
     542    $text =~ s/\n//g;
     543   
     544    # Big assumption here - assume that if any co-ordinates are
     545    # given, then we are at the end of a sentence.
     546    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
     547
     548    # special characters--
     549    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
     550
     551    # ? ps text formatting (eg italics?) ?
     552    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
     553    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
     554    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
     555    # default - remove the rest
     556    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
     557
     558    # attempt to add whitespace between words...
     559    # this is based purely on observation, and may be completely wrong...
     560    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
     561    # eg I notice "b(" is sometimes NOT a space if preceded by a
     562    # negative number.
     563    $text =~ s/\)\d+ ?b\(/\) \( /g;
     564
     565    # change quoted braces to brackets
     566    $text =~ s/([^\\])\\\(/$1\{/g;
     567    $text =~ s/([^\\])\\\)/$1\}/g ;
     568
     569    # remove everything that is not between braces
     570    $text =~ s/\)([^\(\)])+?\(//sg ;
     571   
     572    # remove any Trailer eof stuff.
     573    $text =~ s/\)[^\)]*$//sg;
     574
     575    ### ligatures have special characters...
     576    $text =~ s/\\013/ff/g;
     577    $text =~ s/\\014/fi/g;
     578    $text =~ s/\\015/fl/g;
     579    $text =~ s/\\016/ffi/g;
     580    $text =~ s/\\214/fi/g;
     581    $text =~ s/\\215/fl/g;
     582    $text =~ s/\\017/\n\* /g; # asterisk?
     583    $text =~ s/\\023/\023/g;  # e acute ('e)
     584    $text =~ s/\\177/\252/g;  # u"
     585#   $text =~ s/ ?? /\344/g;  # a"
     586
     587    print OUT "$text";
     588    close OUT;
     589    }
    554590    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    555591    return 1;
Note: See TracChangeset for help on using the changeset viewer.