Changeset 2755 for trunk/gsdl/bin/script/pdftohtml.pl
- Timestamp: 2001-09-26T10:43:44+12:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/pdftohtml.pl
r2743 r2755 45 45 # note - we don't actually ever use most of these options... 46 46 print STDERR 47 ("pdftohtml version 0.22 - modified for NZDL use\n",47 ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n", 48 48 "Usage: pdftohtml [options] <PDF-file> <html-file>\n", 49 " -f <int> : first page to convert\n", 50 " -l <int> : last page to convert\n", 51 " -d <dir> : target directory (default: basename of pdf-file)\n", 52 " -o <file> : name of output file; - means stdout (default index.html)\n", 53 " -q : don't print any messages or errors\n", 54 " -h : print this usage information\n", 55 " -p : exchange .pdf links by .html\n", 56 # these options now have no effect in gs-custom pdftohtml 57 # " -c : generate complex HTML document\n", 58 # " -F : don't use frames in HTML document\n", 59 " -i : ignore images\n", 60 " -e <string> : set extension for images (in the Html-file) (default png)\n" 49 "Options:\n", 50 "\t-i\tignore images (don't extract)\n", 51 "\t-a\tallow images only (continue even if no text is present)\n" 61 52 ); 62 53 exit (1); … … 65 56 sub main { 66 57 my (@ARGV) = @_; 67 my ($first,$last,$target_dir,$out_file,$img_ext, 68 $optq,$opth,$optp,$optF,$opti); 58 my ($allow_no_text,$ignore_images); 69 59 70 60 # read command-line arguments so that 71 61 # you can change the command in this script 72 62 if (!parsargv::parse(\@ARGV, 73 'f/\d+/1', \$first, 74 'l/\d+/1', \$last, 75 'd/[\S]*/', \$target_dir, 76 'o/[\S]*/', \$out_file, 77 'e/[\S]*/', \$img_ext, 78 'q', \$optq, 79 'h', \$opth, 80 'p', \$optp, 81 # 'c', \$optc, 82 'F', \$optF, 83 'i', \$opti 63 'a', \$allow_no_text, 64 'i', \$ignore_images 84 65 )) 85 66 { … … 119 100 $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/); 120 101 102 $cmd .= " -i" if ($ignore_images); 121 103 $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\""; 122 $cmd .= " > \"$output_filestem.out\"";123 124 # attempting to redirect STDERR on windows 95/98 is a bad idea125 $cmd .= " 2> 
\"$output_filestem.err\""126 if $ENV{'GSDLOS'} !~ /^windows$/i;127 104 128 105 # system() returns -1 if it can't run, otherwise it's $cmds ret val. 129 106 # note we return 0 if the file is "encrypted" 107 $!=0; 130 108 if (system($cmd)!=0) { 131 print STDERR " Error executing $cmd:$!\n";109 print STDERR "pdftohtml error for $input_filename $!\n"; 132 110 # leave these for gsConvert.pl... 133 111 #&util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 134 112 #&util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 135 return 0;113 return 1; 136 114 } 137 115 138 116 if (! -e "$output_filestem.html") { 139 return 0;117 return 1; 140 118 } 141 119 142 120 # post-process to remove </b><b> and </i><i>, as these break up 143 121 # words, screwing up indexing and searching. 122 # At the same time, check that our .html file has some textual content. 144 123 &util::mv("$output_filestem.html","$output_filestem.html.tmp"); 124 $!=0; 145 125 open INFILE, "$output_filestem.html.tmp" || 146 126 die "Couldn't open file: $!"; … … 148 128 die "Couldn't open file for writing: $!"; 149 129 my $line; 130 my $seen_textual_content=$allow_no_text; 150 131 while ($line=<INFILE>) { 151 132 $line =~ s#</b><b>##g; 152 133 $line =~ s#</i><i>##g; 153 134 $line =~ s#\\#\\\\#g; # until macro language parsing is fixed... 
135 # check for any extracted text 136 if ($seen_textual_content == 0) { 137 my $tmp_line=$line; 138 $tmp_line =~ s/<[^>]*>//g; 139 $tmp_line =~ s/Page\s\d+//; 140 $tmp_line =~ s/\s*//g; 141 if ($tmp_line ne "") { 142 $seen_textual_content=1; 143 } 144 } 145 154 146 # escape underscores, but not if they're inside tags (eg img/href names) 155 147 my $inatag = 0; # allow multi-line tags … … 178 170 &util::rm("$output_filestem.html.tmp"); 179 171 180 181 172 # Need to convert images from PPM format to PNG format 182 173 my @images; … … 192 183 } 193 184 close IMAGES; 185 &util::rm("${directory}image.log") if (-e "${directory}image.log"); 186 187 # no need to go any further if there is no text extracted from pdf. 188 if ($seen_textual_content == 0) { 189 print STDERR "Error: PDF contains no extractable text\n"; 190 # remove images... 191 for $image (@images) { 192 chomp($image); 193 &util::rm("${directory}$image"); 194 } 195 return 1; 196 } 197 198 194 199 195 200 for $image (@images) { … … 200 205 if (system($cmd)!=0) { 201 206 print STDERR "Error executing $cmd\n"; 202 #return 0; # not sure about whether to leave this one in or take it out207 #return 1; # not sure about whether to leave this one in or take it out 203 208 next; 204 209 } … … 211 216 if (system($cmd)!=0) { 212 217 print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n"; 213 #return 0; # not sure about whether to leave this one in or take it out218 #return 1; # not sure about whether to leave this one in or take it out 214 219 next; 215 220 } … … 219 224 } 220 225 221 return 1;226 return 0; 222 227 } 223 228 224 # indicate our error status 225 if (&main(@ARGV)) {exit 0;} 226 exit 1; 229 # indicate our error status, 0 = success 230 exit (&main(@ARGV)); 231
Note: See TracChangeset for help on using the changeset viewer.