Changeset 2118


Ignore:
Timestamp:
2001-03-05T18:01:57+13:00 (23 years ago)
Author:
jrm21
Message:

Added some code to try to screen out .pdf files likely to cause problems
with pdftohtml.bin, based on some "rule-of-thumb" observations about which
pdf files are likely to have textual data within them, as opposed to image
data.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/pdftohtml.pl

    r2028 r2118  
    4545print STDERR 
    4646    ("pdftohtml version 0.22\n",
    47      "Usage: pdftohtml [options] <PDF-file> [<html-file>]\n",
     47     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
    4848     "  -f <int>      : first page to convert\n",
    4949     "  -l <int>      : last page to convert\n",
     
    9292    my $input_filename = $ARGV[0];
    9393    my $output_filestem = $ARGV[1];
     94    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix
    9495
    9596    my @dir = split (/(\/|\\)/, $input_filename);
     
    99100    if (!-r $input_filename) {
    100101    print STDERR "Error: unable to open $input_filename for reading\n";
     102    exit(1);
     103    }
     104
     105    # Heuristic code added by John McPherson to attempt to reject
     106    # PDFs with no text in them... based entirely on observation. We
     107    # should really read the PDF specifications someday...
     108    open (PDFIN, $input_filename) ||
     109    die "Error: unable to open $input_filename for reading\n";
     110
     111    my $found_text_object=0;
     112    my $num_objects=0;
     113    my $non_text_objects=0;
     114    my $unenc_stream_objects=0;
     115    my $line;
     116    while (!$found_text_object && ($_=<PDFIN>)) {
     117    s/\r/\n/g;
     118    if (/^\d+ \d+ obj/ms) {
     119        # start of new object
     120        my $object="";
     121        $num_objects++;
     122        while (! eof && ! /(>>\s*)?endobj/) {
     123        $object.=$_;
     124        $_=<PDFIN>;
     125        }
     126        if (!defined $_) {$_="";} # we've hit end of file in a funny place.
     127        # we've got to the end of the current PDF object.
     128        $object.=$_;
     129       
     130        # remove newline chars, to help our pattern matching for whitespace
     131        $object =~ s/\n/ /gs;
     132
     133        #determine object type...
     134        $_=$object;
     135       
     136# for PDFWriter , and pdflatex and distill. Eg:
     137# "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
     138# Ie this looks like compressed text....
     139        if (/\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
     140        $found_text_object=1;
     141        }
     142        # For pdflatex or ps2pdf from dvi->ps:
     143        # if we are setting a font, then following object is probably text
     144        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
     145        elsif (/obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
     146        $found_text_object=1;
     147        }
     148        # Unencoded streams. Eg
     149        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
     150        elsif (/<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s)
     151        {
     152        $unenc_stream_objects++;
     153        }
     154        # (some) non-text objects
     155        elsif (/<<.*\/(Type).*>>/s) {
     156        $non_text_objects++;
     157        }
     158
     159    } else { # not in an object...
     160        # header? footer?
     161#       print $_;
     162    }
     163    if ($found_text_object) {close PDFIN;}
     164
     165    } # end of while
     166    close PDFIN;
     167   
     168    # decide whether to accept or reject...
     169    # some of these numbers are completely arbitrary based on a few .pdfs.
     170    if ( ($found_text_object > 0) ||
     171     ($num_objects<=1500 && $unenc_stream_objects > 5)
     172     )
     173    {
     174    # accept this .pdf. Currently do nothing except fall through...
     175    } else {
     176    # reject this .pdf.
     177    print STDERR "pdftohtml.pl: $input_filename appears to have no ";
     178    print STDERR "textual data. Aborting.\n";
     179    print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";
    101180    exit(1);
    102181    }
     
    122201    # Need to convert images from PPM format to PNG format
    123202    my @images;
     203
    124204    open (IMAGES, "images.log");
    125205    while (<IMAGES>) {
     
    134214        $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe");
    135215        $cmd .= " $image";
    136         if (system($cmd)>0) {
     216        if (system($cmd)!=0) {
    137217        print STDERR "Error executing $cmd\n";
    138218        return 0; # not sure about whether to leave this one in or take it out
     
    143223
    144224        $cmd = "pnmtopng $image > $image_base.png 2>/dev/null";
    145         if (system($cmd)>0) {
     225        if (system($cmd)!=0) {
    146226        $cmd = "convert $image $image_base.png 2>/dev/null";
    147         if (system($cmd)>0) {
     227        if (system($cmd)!=0) {
    148228            print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    149229            return 0; # not sure about whether to leave this one in or take it out
Note: See TracChangeset for help on using the changeset viewer.