Changeset 2118
- Timestamp: 2001-03-05T18:01:57+13:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/pdftohtml.pl
r2028 r2118 45 45 print STDERR 46 46 ("pdftohtml version 0.22\n", 47 "Usage: pdftohtml [options] <PDF-file> [<html-file>]\n",47 "Usage: pdftohtml [options] <PDF-file> <html-file>\n", 48 48 " -f <int> : first page to convert\n", 49 49 " -l <int> : last page to convert\n", … … 92 92 my $input_filename = $ARGV[0]; 93 93 my $output_filestem = $ARGV[1]; 94 $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix 94 95 95 96 my @dir = split (/(\/|\\)/, $input_filename); … … 99 100 if (!-r $input_filename) { 100 101 print STDERR "Error: unable to open $input_filename for reading\n"; 102 exit(1); 103 } 104 105 # Heuristical code added by John McPherson to attempt to reject 106 # PDF's with no text in them.... based entirely on observation. We 107 # should really read the PDF specifications someday... 108 open (PDFIN, $input_filename) || 109 die "Error: unable to open $input_filename for reading\n"; 110 111 my $found_text_object=0; 112 my $num_objects=0; 113 my $non_text_objects=0; 114 my $unenc_stream_objects=0; 115 my $line; 116 while (!$found_text_object && ($_=<PDFIN>)) { 117 s/\r/\n/g; 118 if (/^\d+ \d+ obj/ms) { 119 # start of new object 120 my $object=""; 121 $num_objects++; 122 while (! eof && ! /(>>\s*)?endobj/) { 123 $object.=$_; 124 $_=<PDFIN>; 125 } 126 if (!defined $_) {$_="";} # we've hit end of file in a funny place. 127 # we've got to the end of the current PDF object. 128 $object.=$_; 129 130 # remove newline chars, to help our pattern matching for whitespace 131 $object =~ s/\n/ /gs; 132 133 #determine object type... 134 $_=$object; 135 136 # for PDFWriter , and pdflatex and distill. Eg: 137 # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..." 138 # Ie this looks like compressed text.... 
139 if (/\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) { 140 $found_text_object=1; 141 } 142 # For pdflatex or ps2pdf from dvi->ps: 143 # if we are setting a font, then following object is probably text 144 # Eg "obj << /Font" or "obj << /ProcSet [...] /Font" 145 elsif (/obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) { 146 $found_text_object=1; 147 } 148 # Unencoded streams. Eg 149 # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..." 150 elsif (/<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) 151 { 152 $unenc_stream_objects++; 153 } 154 # (some) non-text objects 155 elsif (/<<.*\/(Type).*>>/s) { 156 $non_text_objects++; 157 } 158 159 } else { # not in an object... 160 # header? footer? 161 # print $_; 162 } 163 if ($found_text_object) {close PDFIN;} 164 165 } # end of while 166 close PDFIN; 167 168 # decide whether to accept or reject... 169 # some of these numbers are completely arbitrary based on a few .pdfs. 170 if ( ($found_text_object > 0) || 171 ($num_objects<=1500 && $unenc_stream_objects > 5) 172 ) 173 { 174 # accept this .pdf. Currently do nothing except fall through... 175 } else { 176 # reject this .pdf. 177 print STDERR "pdftohtml.pl: $input_filename appears to have no "; 178 print STDERR "textual data. 
Aborting.\n"; 179 print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n"; 101 180 exit(1); 102 181 } … … 122 201 # Need to convert images from PPM format to PNG format 123 202 my @images; 203 124 204 open (IMAGES, "images.log"); 125 205 while (<IMAGES>) { … … 134 214 $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe"); 135 215 $cmd .= " $image"; 136 if (system($cmd) >0) {216 if (system($cmd)!=0) { 137 217 print STDERR "Error executing $cmd\n"; 138 218 return 0; # not sure about whether to leave this one in or take it out … … 143 223 144 224 $cmd = "pnmtopng $image > $image_base.png 2>/dev/null"; 145 if (system($cmd) >0) {225 if (system($cmd)!=0) { 146 226 $cmd = "convert $image $image_base.png 2>/dev/null"; 147 if (system($cmd) >0) {227 if (system($cmd)!=0) { 148 228 print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n"; 149 229 return 0; # not sure about whether to leave this one in or take it out
Note: See TracChangeset for help on using the changeset viewer.