Changeset 2031

Show
Ignore:
Timestamp:
20.02.2001 18:54:46 (19 years ago)
Author:
jrm21
Message:

Slightly improved the PostScript-to-text fallback conversion.
Also, system($cmd) can return -1 (the command could not be started), not just a value > 0, so the failure check now tests for != 0.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r2023 r2031  
    418418    $cmd .= " \"$input_filename\" \"$output_filestem\""; 
    419419 
    420     if (system($cmd)>0) 
     420    if (system($cmd)!=0) 
    421421    { 
    422422    print STDERR "Error executing $cmd: $!\n"; 
     
    489489 
    490490    my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save "; 
    491     $cmd .= "-f ps2ascii \"$input_filename\" -c quit > \"$output_filestem.text\""; 
     491    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\""; 
    492492    $cmd .= " 2> $output_filestem.err"; 
    493493    $!=0; 
     
    509509    close PSOUT; 
    510510    } 
    511 #    print STDERR "retcode=$retcode, error=\"$!\"\n"; 
    512511    if ($error ne "") 
    513512    { 
     
    517516 
    518517    # Fine then. We'll just do a lousy job by ourselves... 
    519     # Based on regexps nicked from: 
     518    # Based on 5-line regexp sed script found at: 
    520519    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html 
    521520    #  
     
    528527    if ($errorcode) {print STDERR "errors\n";return 0;} 
    529528     
    530     my $in_a_sentence=0; 
     529    my $text="";  # this is for whole .ps file... 
    531530    while (<IN>) { 
    532         if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line 
    533         # attempt to add whitespace between different lines... 
    534         s/F.?\(/\( /g; # this might break up some other words though... 
    535         ### remove all postscript control data 
    536         if (!$in_a_sentence) { 
    537         s/^[^\(\)]*?\(//;  # rm start of line up to first open bracket 
    538         } 
    539         s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces 
    540         s/\)([^\(\)])*?\(//g ;   # close bracket up to next open unquoted bracket 
    541         if (s/\)[^\(\)]*?$//g)  # last close bracket to end of line 
    542         {$in_a_sentence=0;chomp;}      
    543         if (s/\\$//) # if line is a continuation 
    544         {$in_a_sentence=1;chomp;}  
    545         s/^$//g ;             # remove empty lines 
    546         ### ligatures have special characters... 
    547         s/\\214/fi/g; 
    548         s/\\215/fl/g; 
    549         print OUT "$_"; 
    550     } 
    551     close IN; close OUT; 
    552     } 
    553  
     531        $text.=$_; 
     532    } 
     533    close IN; 
     534 
     535    # if ps has Page data, then use it to delete all stuff before it. 
     536    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line 
     537     
     538    # remove all leading non-data stuff 
     539    $text =~ s/^.*?\(//s; 
     540 
     541    # remove all newline chars for easier processing 
     542    $text =~ s/\n//g; 
     543     
     544    # Big assumption here - assume that if any co-ordinates are 
     545    # given, then we are at the end of a sentence. 
     546    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g; 
     547 
     548    # special characters-- 
     549    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash? 
     550 
     551    # ? ps text formatting (eg italics?) ? 
     552    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> { 
     553    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> } 
     554    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> | 
     555    # default - remove the rest 
     556    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g; 
     557 
     558    # attempt to add whitespace between words...  
     559    # this is based purely on observation, and may be completely wrong... 
     560    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g; 
     561    # eg I notice "b(" is sometimes NOT a space if preceded by a  
     562    # negative number. 
     563    $text =~ s/\)\d+ ?b\(/\) \( /g; 
     564 
     565    # change quoted braces to brackets 
     566    $text =~ s/([^\\])\\\(/$1\{/g; 
     567    $text =~ s/([^\\])\\\)/$1\}/g ; 
     568 
     569    # remove everything that is not between braces 
     570    $text =~ s/\)([^\(\)])+?\(//sg ; 
     571     
     572    # remove any Trailer eof stuff. 
     573    $text =~ s/\)[^\)]*$//sg; 
     574 
     575    ### ligatures have special characters... 
     576    $text =~ s/\\013/ff/g; 
     577    $text =~ s/\\014/fi/g; 
     578    $text =~ s/\\015/fl/g; 
     579    $text =~ s/\\016/ffi/g; 
     580    $text =~ s/\\214/fi/g; 
     581    $text =~ s/\\215/fl/g; 
     582    $text =~ s/\\017/\n\* /g; # asterisk? 
     583    $text =~ s/\\023/\023/g;  # e acute ('e) 
     584    $text =~ s/\\177/\252/g;  # u" 
     585#   $text =~ s/ ?? /\344/g;  # a" 
     586 
     587    print OUT "$text"; 
     588    close OUT; 
     589    } 
    554590    &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 
    555591    return 1;