Changeset 2352 for trunk/gsdl/bin/script/pdftohtml.pl
- Timestamp: 2001-05-02T14:01:55+12:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/bin/script/pdftohtml.pl
r2346 r2352   (columns: old line number, new line number; "-" = removed, "+" = added, "…" = unchanged lines omitted)
   43    43
   44    44   sub print_usage {
         45 + # note - we don't actually ever use most of these options...
   45    46   print STDERR
   46       - ("pdftohtml version 0.22 \n",
         47 + ("pdftohtml version 0.22 - modified for NZDL use\n",
   47    48   "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
   48    49   " -f <int> : first page to convert\n",
    …     …
   53    54   " -h : print this usage information\n",
   54    55   " -p : exchange .pdf links by .html\n",
   55       - " -c : generate complex HTML document\n",
   56       - " -F : don't use frames in HTML document\n",
         56 + # these options now have no effect in gs-custom pdftohtml.bin
         57 + # " -c : generate complex HTML document\n",
         58 + # " -F : don't use frames in HTML document\n",
   57    59   " -i : ignore images\n",
   58    60   " -e <string> : set extension for images (in the Html-file) (default png)\n"
    …     …
   64    66   my (@ARGV) = @_;
   65    67   my ($first,$last,$target_dir,$out_file,$img_ext,
   66       - $optq,$opth,$optp,$optc,$optF,$opti);
         68 + $optq,$opth,$optp,$optF,$opti);
   67    69
   68    70   # read command-line arguments so that
    …     …
   77    79   'h', \$opth,
   78    80   'p', \$optp,
   79       - 'c', \$optc,
         81 + # 'c', \$optc,
   80    82   'F', \$optF,
   81    83   'i', \$opti
    …     …
  103   105   }
  104   106
  105       - # Heuristical code added by John McPherson to attempt to reject
  106       - # PDF's with no text in them.... based entirely on observation. We
  107       - # should really read the PDF specifications someday...
  108       - open (PDFIN, $input_filename) ||
  109       - die "Error: unable to open $input_filename for reading\n";
        107 + # Heuristical code removed due to pdftohtml.bin being "fixed" to not
        108 + # create bitmaps for each char in some pdfs. However, this means we
        109 + # now create .html files even if we can't extract any text. We should
        110 + # check for that now instead someday...
  110   111
  111       - my $found_text_object=0;
  112       - my $num_objects=0;
  113       - my $non_text_objects=0;
  114       - my $unenc_stream_objects=0;
  115       - my $line;
  116       - while (!$found_text_object && ($_=<PDFIN>)) {
  117       - s/\r/\n/g;
  118       - if (/^\d+ \d+ obj/ms) {
  119       - # start of new object
  120       - my $object="";
  121       - $num_objects++;
  122       - while (! eof && ! /(>>\s*)?endobj/) {
  123       - $object.=$_;
  124       - $_=<PDFIN>;
  125       - }
  126       - if (!defined $_) {$_="";} # we've hit end of file in a funny place.
  127       - # we've got to the end of the current PDF object.
  128       - $object.=$_;
  129       -
  130       - # remove newline chars, to help our pattern matching for whitespace
  131       - $object =~ s/\n/ /gs;
  132       -
  133       - #determine object type...
  134       - $_=$object;
  135       -
  136       - # for PDFWriter , and pdflatex and distill. Eg:
  137       - # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
  138       - # Ie this looks like compressed text....
  139       - if (/\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
  140       - $found_text_object=1;
  141       - }
  142       - # For pdflatex or ps2pdf from dvi->ps:
  143       - # if we are setting a font, then following object is probably text
  144       - # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
  145       - elsif (/obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
  146       - $found_text_object=1;
  147       - }
  148       - # Unencoded streams. Eg
  149       - # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
  150       - elsif (/<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s)
  151       - {
  152       - $unenc_stream_objects++;
  153       - }
  154       - # (some) non-text objects
  155       - elsif (/<<.*\/(Type).*>>/s) {
  156       - $non_text_objects++;
  157       - }
  158       -
  159       - } else { # not in an object...
  160       - # header? footer?
  161       - # print $_;
  162       - }
  163       - if ($found_text_object) {close PDFIN;}
  164       -
  165       - } # end of while
  166       - close PDFIN;
  167       -
  168       - # decide whether to accept or reject...
  169       - # some of these numbers are completely arbitrary based on a few .pdfs.
  170       - if ( ($found_text_object > 0) ||
  171       - ($num_objects<=1500 && $unenc_stream_objects > 5)
  172       - )
  173       - {
  174       - # accept this .pdf. Currently do nothing except fall through...
  175       - } else {
  176       - # reject this .pdf.
  177       - print STDERR "pdftohtml.pl: $input_filename appears to have no ";
  178       - print STDERR "textual data. Aborting.\n";
  179       - # print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";
  180       - exit(1);
  181       - }
  182       -
        112 +
  183   113   # formulate the command
Note:
See TracChangeset
for help on using the changeset viewer.