Context Navigation

← Previous Changeset
Next Changeset →

Changeset 2118

Timestamp:

2001-03-05T18:01:57+13:00 (23 years ago)

Author:

jrm21

Message:

Added some code to try to screen out .pdf files likely to cause problems
with pdftohtml.bin, based on some "rule-of-thumb" observations about which
pdf files are likely to have textual data within them, as opposed to image
data.

File:

: 1 edited

trunk/gsdl/bin/script/pdftohtml.pl (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/bin/script/pdftohtml.pl

-              r2028
+              r2118
 print STDERR
     ("pdftohtml version 0.22\n",
      "Usage: pdftohtml [options] <PDF-file> [<html-file>]\n",
+     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
      "  -f <int>      : first page to convert\n",
      "  -l <int>      : last page to convert\n",
 …
     my $input_filename = $ARGV[0];
     my $output_filestem = $ARGV[1];
+    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix
     my @dir = split (/(\/|\\)/, $input_filename);
 …
     if (!-r $input_filename) {
     print STDERR "Error: unable to open $input_filename for reading\n";
+    exit(1);
+    }
+    # Heuristic code added by John McPherson to attempt to reject
+    # PDFs with no text in them... based entirely on observation. We
+    # should really read the PDF specifications someday...
+    open (PDFIN, $input_filename) ||
+    die "Error: unable to open $input_filename for reading\n";
+    my $found_text_object=0;
+    my $num_objects=0;
+    my $non_text_objects=0;
+    my $unenc_stream_objects=0;
+    my $line;
+    while (!$found_text_object && ($_=<PDFIN>)) {
+    s/\r/\n/g;
+    if (/^\d+ \d+ obj/ms) {
+        # start of new object
+        my $object="";
+        $num_objects++;
+        while (! eof && ! /(>>\s*)?endobj/) {
+        $object.=$_;
+        $_=<PDFIN>;
+        }
+        if (!defined $_) {$_="";} # we've hit end of file in a funny place.
+        # we've got to the end of the current PDF object.
+        $object.=$_;
+        # remove newline chars, to help our pattern matching for whitespace
+        $object =~ s/\n/ /gs;
+        #determine object type...
+        $_=$object;
+# For PDFWriter, pdflatex and distill. E.g.:
+# "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
+# I.e. this looks like compressed text...
+        if (/\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
+        $found_text_object=1;
+        }
+        # For pdflatex or ps2pdf from dvi->ps:
+        # if we are setting a font, then following object is probably text
+        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
+        elsif (/obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
+        $found_text_object=1;
+        }
+        # Unencoded streams. Eg
+        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
+        elsif (/<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s)
+        {
+        $unenc_stream_objects++;
+        }
+        # (some) non-text objects
+        elsif (/<<.*\/(Type).*>>/s) {
+        $non_text_objects++;
+        }
+    } else { # not in an object...
+        # header? footer?
+#       print $_;
+    }
+    if ($found_text_object) {close PDFIN;}
+    } # end of while
+    close PDFIN;
+    # decide whether to accept or reject...
+    # some of these numbers are completely arbitrary based on a few .pdfs.
+    if ( ($found_text_object > 0) ||
+     ($num_objects<=1500 && $unenc_stream_objects > 5)
+     )
+    {
+    # accept this .pdf. Currently do nothing except fall through...
+    } else {
+    # reject this .pdf.
+    print STDERR "pdftohtml.pl: $input_filename appears to have no ";
+    print STDERR "textual data. Aborting.\n";
+    print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";
     exit(1);
+    }
 …
     # Need to convert images from PPM format to PNG format
     my @images;
     open (IMAGES, "images.log");
     while (<IMAGES>) {
 …
         $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe");
         $cmd .= " $image";
         if (system($cmd)>0) {
+        if (system($cmd)!=0) {
         print STDERR "Error executing $cmd\n";
         return 0; # not sure about whether to leave this one in or take it out
 …
         $cmd = "pnmtopng $image > $image_base.png 2>/dev/null";
         if (system($cmd)>0) {
+        if (system($cmd)!=0) {
         $cmd = "convert $image $image_base.png 2>/dev/null";
         if (system($cmd)>0) {
+        if (system($cmd)!=0) {
             print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
             return 0; # not sure about whether to leave this one in or take it out

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2118

Legend:

trunk/gsdl/bin/script/pdftohtml.pl

Download in other formats: