Changeset 2599


Ignore:
Timestamp:
2001-06-20T14:32:24+12:00 (23 years ago)
Author:
jrm21
Message:

We now do some post-processing to repair words that are split by HTML tags - e.g.
<b>Wo</b><b>rds</b> would go to mg as "Wo" and "rds", not "Words". It also
makes the HTML cleaner. We currently only do this for <b> and <i> tags.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/pdftohtml.pl

    r2575 r2599  
    135135    }
    136136
     137# post-process to remove </b><b> and </i><i>, as these break up
     138# words, screwing up indexing and searching.
     139    &util::mv("$output_filestem.html","$output_filestem.html.tmp");
     140    open INFILE, "$output_filestem.html.tmp" ||
     141    die "Couldn't open file: $!";
     142    open OUTFILE, ">$output_filestem.html" ||
     143    die "Couldn't open file for writing: $!";
     144    my $line;
     145    while ($line=<INFILE>) {
     146    $line =~ s#</b><b>##g;
     147    $line =~ s#</i><i>##g;
     148    print OUTFILE $line;
     149    }
     150    close INFILE;
     151    close OUTFILE;
     152    &util::rm("$output_filestem.html.tmp");
     153
     154
    137155    # Need to convert images from PPM format to PNG format
    138156    my @images;
     
    154172        if (system($cmd)!=0) {
    155173        print STDERR "Error executing $cmd\n";
    156         return 0; # not sure about whether to leave this one in or take it out
     174        #return 0; # not sure about whether to leave this one in or take it out
     175        next;
    157176        }
    158177    } else {
     
    165184        if (system($cmd)!=0) {
    166185            print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    167             return 0; # not sure about whether to leave this one in or take it out
     186            #return 0; # not sure about whether to leave this one in or take it out
     187            next;
    168188        }
    169189        }
     
    175195}
    176196
    177 &main(@ARGV);
     197# indicate our error status
     198if (&main(@ARGV)) {exit 0;}
     199exit 1;
Note: See TracChangeset for help on using the changeset viewer.