Changeset 2352


Ignore:
Timestamp:
2001-05-02T14:01:55+12:00 (23 years ago)
Author:
jrm21
Message:

Removed the crappy heuristic code that tried to check for extractable text
first, since pdftohtml.bin has been updated and no longer breaks (much).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/pdftohtml.pl

    r2346 r2352  
    4343
    4444sub print_usage {
     45# note - we don't actually ever use most of these options...
    4546print STDERR 
    46     ("pdftohtml version 0.22\n",
     47    ("pdftohtml version 0.22 - modified for NZDL use\n",
    4748     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
    4849     "  -f <int>      : first page to convert\n",
     
    5354     "  -h            : print this usage information\n",
    5455     "  -p            : exchange .pdf links by .html\n",
    55      "  -c            : generate complex HTML document\n",
    56      "  -F            : don't use frames in HTML document\n",
     56# these options now have no effect in gs-custom pdftohtml.bin
     57#     "  -c            : generate complex HTML document\n",
     58#     "  -F            : don't use frames in HTML document\n",
    5759     "  -i            : ignore images\n",
    5860     "  -e <string>   : set extension for images (in the Html-file) (default png)\n"
     
    6466    my (@ARGV) = @_;
    6567    my ($first,$last,$target_dir,$out_file,$img_ext,
    66     $optq,$opth,$optp,$optc,$optF,$opti);
     68    $optq,$opth,$optp,$optF,$opti);
    6769   
    6870    # read command-line arguments so that
     
    7779             'h', \$opth,
    7880             'p', \$optp,
    79              'c', \$optc,
     81#            'c', \$optc,
    8082             'F', \$optF,
    8183             'i', \$opti
     
    103105    }
    104106
    105     # Heuristical code added by John McPherson to attempt to reject
    106     # PDF's with no text in them.... based entirely on observation. We
    107     # should really read the PDF specifications someday...
    108     open (PDFIN, $input_filename) ||
    109     die "Error: unable to open $input_filename for reading\n";
     107    # Heuristic code removed because pdftohtml.bin has been "fixed" so that
     108    # it no longer creates a bitmap for each char in some PDFs. However, this
     109    # means we now create .html files even if we can't extract any text. We
     110    # should check for that case here instead someday...
    110111
    111     my $found_text_object=0;
    112     my $num_objects=0;
    113     my $non_text_objects=0;
    114     my $unenc_stream_objects=0;
    115     my $line;
    116     while (!$found_text_object && ($_=<PDFIN>)) {
    117     s/\r/\n/g;
    118     if (/^\d+ \d+ obj/ms) {
    119         # start of new object
    120         my $object="";
    121         $num_objects++;
    122         while (! eof && ! /(>>\s*)?endobj/) {
    123         $object.=$_;
    124         $_=<PDFIN>;
    125         }
    126         if (!defined $_) {$_="";} # we've hit end of file in a funny place.
    127         # we've got to the end of the current PDF object.
    128         $object.=$_;
    129        
    130         # remove newline chars, to help our pattern matching for whitespace
    131         $object =~ s/\n/ /gs;
    132 
    133         #determine object type...
    134         $_=$object;
    135        
    136 # for PDFWriter , and pdflatex and distill. Eg:
    137 # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
    138 # Ie this looks like compressed text....
    139         if (/\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
    140         $found_text_object=1;
    141         }
    142         # For pdflatex or ps2pdf from dvi->ps:
    143         # if we are setting a font, then following object is probably text
    144         # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
    145         elsif (/obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
    146         $found_text_object=1;
    147         }
    148         # Unencoded streams. Eg
    149         # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
    150         elsif (/<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s)
    151         {
    152         $unenc_stream_objects++;
    153         }
    154         # (some) non-text objects
    155         elsif (/<<.*\/(Type).*>>/s) {
    156         $non_text_objects++;
    157         }
    158 
    159     } else { # not in an object...
    160         # header? footer?
    161 #       print $_;
    162     }
    163     if ($found_text_object) {close PDFIN;}
    164 
    165     } # end of while
    166     close PDFIN;
    167    
    168     # decide whether to accept or reject...
    169     # some of these numbers are completely arbitrary based on a few .pdfs.
    170     if ( ($found_text_object > 0) ||
    171      ($num_objects<=1500 && $unenc_stream_objects > 5)
    172      )
    173     {
    174     # accept this .pdf. Currently do nothing except fall through...
    175     } else {
    176     # reject this .pdf.
    177     print STDERR "pdftohtml.pl: $input_filename appears to have no ";
    178     print STDERR "textual data. Aborting.\n";
    179     # print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";
    180     exit(1);
    181     }
    182112
    183113    # formulate the command
Note: See TracChangeset for help on using the changeset viewer.