Changeset 1960


Ignore:
Timestamp:
2001-02-13T11:48:10+13:00 (23 years ago)
Author:
dg5
Message:

Modified pdftohtml.pl to reflect the change in location of the pdftohtml.bin file

Location:
trunk/gsdl/bin/script
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r1928 r1960  
    6060    my (@ARGV) = @_;
    6161    my ($input_type,$output_type,$verbose,$timeout);
    62    
     62
    6363    $timeout = 0;
    6464    # read command-line arguments
     
    167167    # Attempt specialised conversion to HTML
    168168    if (!$output_type || ($output_type =~ /html/i)) {
    169     print STDERR "I am about to call doc_to_html...\n";
    170169    $success = &doc_to_html($input_filename, $output_filestem);
    171170    if ($success) {
     
    280279    ($input_filename) = @_;
    281280   
    282     open(TMP, ">temp.txt");
    283     binmode(TMP);
    284281    open(CHK, "<$input_filename");
    285282    binmode(CHK);
     
    290287   
    291288    $line = $_;
    292     print TMP "$line\n\n";
     289
    293290    if ($first) {
    294291        # check to see if this is an rtf file
     
    494491    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    495492    $cmd .= " 2> $output_filestem.err";
     493   
    496494    if (system($cmd)>0)
    497495    {
     
    499497    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    500498    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    501 
    502     # Fine then. We'll just do a lousy job by ourselves...
    503     # Based on code nicked from:
    504     # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
    505     #
    506     print STDERR "Attempting to strip text from postscript.\n";
    507     my $errorcode=0;
    508     open (IN, "$input_filename")
    509         ||  ($errorcode=1, warn "Couldn't read file: $!");
    510     open (OUT, ">$output_filestem.text")
    511         ||  ($errorcode=1, warn "Couldn't write file: $!");
    512     if ($errorcode) {print STDERR "errors\n";return 0;}
    513 
    514     my $in_a_sentence=0;
    515     while (<IN>) {
    516         if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
    517         # attempt to add whitespace between different lines...
    518         s/F.?\(/\( /g; # this might break up some other words though...
    519         ### remove all postscript control data
    520         if (!$in_a_sentence) {
    521         s/^[^\(\)]*?\(//;}  # rm start of line up to first open bracket
    522         s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
    523         s/\)([^\(\)])*?\(//g ;   # close bracket up to next open unquoted bracket
    524         if (s/\)[^\(\)]*?$//g)  # last close bracket to end of line
    525         {$in_a_sentence=0;chomp;}     
    526         if (s/\\$//) # if line is a continuation
    527         {$in_a_sentence=1;chomp;}
    528         s/^$//g ;             # remove empty lines
    529         ### ligatures have special characters...
    530         s/\\214/fi/g;
    531         s/\\215/fl/g;
    532         print OUT "$_";
    533     }
    534     close IN; close OUT;
    535     }
     499    return 0;
     500    }
     501
    536502    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    537503    return 1;
     
    574540    ($input_filename, $output_filestem) = @_;
    575541
    576     #open(TEMP, ">temp.txt");
    577542    open(IN, "<$input_filename");
    578543    binmode(IN);
     
    585550
    586551    # delete anything that isn't a printable character
    587     #print TEMP $line;
    588552    $line =~ s/[^\040-\176]+/\n/sg;
    589553
  • trunk/gsdl/bin/script/pdftohtml.pl

    r1928 r1960  
    110110    else  {
    111111    $p_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "pdftohtml");
    112     $pdftohtml = &util::filename_cat($p_home, "pdftohtml_0_22", "pdftohtml.bin");
     112    $pdftohtml = &util::filename_cat($p_home, "bin", "pdftohtml.bin");
    113113    }
    114114    return 0 unless (-e "$pdftohtml");
Note: See TracChangeset for help on using the changeset viewer.