source: trunk/gsdl/bin/script/pdftohtml.pl@ 2289

Last change on this file since 2289 was 2289, checked in by jrm21, 23 years ago

check if system() returns != 0, rather than just > 0 (-1 => can't run).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.6 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # The Greenstone perl modules loaded below (parsargv, util) are looked
    # up under $GSDLHOME/perllib, so refuse to start without GSDLHOME.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # Put the Greenstone library directory at the front of the search path.
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# Print the usage summary to STDERR and terminate the script with exit
# status 1.  This sub never returns to its caller.
sub print_usage {
    # One entry per output line; the trailing empty entry makes join()
    # terminate the last line with a newline as well.
    my @usage_lines = (
        "pdftohtml version 0.22",
        "Usage: pdftohtml [options] <PDF-file> <html-file>",
        " -f <int> : first page to convert",
        " -l <int> : last page to convert",
        " -d <dir> : target directory (default: basename of pdf-file)",
        " -o <file> : name of output file; - means stdout (default index.html)",
        " -q : don't print any messages or errors",
        " -h : print this usage information",
        " -p : exchange .pdf links by .html",
        " -c : generate complex HTML document",
        " -F : don't use frames in HTML document",
        " -i : ignore images",
        " -e <string> : set extension for images (in the Html-file) (default png)",
        "",
    );

    my $usage_text = join("\n", @usage_lines);
    print STDERR $usage_text;

    exit(1);
}
62
# Convert a PDF file to HTML by running the external pdftohtml program,
# then convert every image listed in "images.log" to PNG.
#
# Arguments: the command-line argument list (options followed by
#            <PDF-file> <html-file>).
# Returns:   1 on success, 0 if pdftohtml or an image conversion failed.
# Exits:     with status 1 (never returns) on bad usage, an unreadable
#            input file, or a PDF that appears to contain no text.
sub main {
    my (@args) = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optc, $optF, $opti);

    # Read the command-line arguments so that the command built below can
    # be changed in this script.  The option values are parsed (and removed
    # from @args) but are not otherwise used at present.
    if (!parsargv::parse(\@args,
                         'f/\d+/1',  \$first,
                         'l/\d+/1',  \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti))
    {
        print_usage();
    }

    # Exactly two positional arguments must remain: input and output.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//;    # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading.
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Reject PDFs that do not seem to contain any text.
    if (!_pdf_appears_textual($input_filename)) {
        print STDERR "pdftohtml.pl: $input_filename appears to have no ";
        print STDERR "textual data. Aborting.\n";
        exit(1);
    }

    # Decide once whether we are on Windows, so that every platform test
    # below agrees (and an unset GSDLOS does not trigger warnings).
    my $gsdlos     = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : '';
    my $on_windows = ($gsdlos =~ /^windows/i) ? 1 : 0;

    # Formulate the command.  Don't include the path on windows (to avoid
    # having to play about with quoting when GSDLHOME might contain spaces)
    # but assume that the PATH is set up correctly - note also that on
    # windows we use pdftohtml.exe not pdftohtml.bin.
    my $cmd = $on_windows
        ? "pdftohtml"
        : util::filename_cat($ENV{'GSDLHOME'}, "bin", $gsdlos, "pdftohtml.bin");

    $cmd .= " -noframes "
          . _shell_quote($input_filename, $on_windows) . " "
          . _shell_quote("$output_filestem.html", $on_windows);
    $cmd .= " > " . _shell_quote("$output_filestem.out", $on_windows);

    # Attempting to redirect STDERR on windows 95/98 is a bad idea.
    if (!$on_windows) {
        $cmd .= " 2> " . _shell_quote("$output_filestem.err", $on_windows);
    }

    # system() returns -1 if it can't run the command (the reason is then
    # in $!); otherwise it returns the command's wait status.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason;
        if ($status == -1) {
            $reason = $!;
        }
        elsif ($status & 127) {
            $reason = "killed by signal " . ($status & 127);
        }
        else {
            $reason = "exit status " . ($status >> 8);
        }
        print STDERR "Error executing $cmd: $reason\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err")  if (-e "$output_filestem.err");
        return 0;
    }

    # Need to convert images from PPM format to PNG format.  The image
    # names are read from "images.log" in the current directory; if that
    # file cannot be opened there is nothing to convert.
    my @images;
    if (open(my $log_fh, '<', "images.log")) {
        @images = <$log_fh>;
        close($log_fh);
    }
    chomp(@images);

    for my $image (@images) {
        next if $image eq '';    # a blank entry cannot name an image

        if ($on_windows) {
            # NOTE(review): unlike the branch below, this neither quotes
            # the name nor redirects the output; kept as it was - confirm
            # against the Windows pnmtopng that is actually shipped.
            my $convert_cmd = "pnmtopng $image";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                return 0; # not sure about whether to leave this one in or take it out
            }
        }
        else {
            # The PNG name is everything before the FIRST dot of the image
            # name.  NOTE(review): this truncates names that contain other
            # dots; kept as-is because the generated HTML presumably refers
            # to the same names - confirm.
            my @nameparts  = split(/\./, $image);
            my $image_base = shift(@nameparts);

            my $source = _shell_quote($image, $on_windows);
            my $target = _shell_quote("$image_base.png", $on_windows);

            # Try pnmtopng first and fall back to ImageMagick's convert.
            if (system("pnmtopng $source > $target 2>/dev/null") != 0) {
                if (system("convert $source $target 2>/dev/null") != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    return 0; # not sure about whether to leave this one in or take it out
                }
            }
        }

        # The original image is no longer needed once it has been converted.
        util::rm($image);
    }

    return 1;
}

# Heuristic code added by John McPherson to attempt to reject PDFs with no
# text in them.... based entirely on observation.  We should really read
# the PDF specifications someday...
#
# Scans the objects of the PDF file $pdf_filename and returns 1 if the file
# looks as though it contains text, 0 otherwise.  Dies if the file cannot
# be opened.
sub _pdf_appears_textual {
    my ($pdf_filename) = @_;

    open(my $pdf_fh, '<', $pdf_filename)
        or die "Error: unable to open $pdf_filename for reading\n";
    # PDFs are binary data: stop the I/O layer from translating line ends
    # or treating a control character as end-of-file on some platforms.
    binmode($pdf_fh);

    my $found_text_object    = 0;
    my $num_objects          = 0;
    my $unenc_stream_objects = 0;

    # Stop reading as soon as one object that looks like text is found.
    while (!$found_text_object && defined(my $line = <$pdf_fh>)) {
        $line =~ s/\r/\n/g;

        # Only the start of a new object ("12 0 obj") is of interest here;
        # anything else (header? footer?) is skipped.
        next unless $line =~ /^\d+ \d+ obj/ms;
        $num_objects++;

        # Collect the lines of this object up to and including "endobj".
        my $object = "";
        while (!eof($pdf_fh) && $line !~ /(>>\s*)?endobj/) {
            $object .= $line;
            $line = <$pdf_fh>;
        }
        $line = "" unless defined $line;    # hit end of file in a funny place
        $object .= $line;

        # Turn line-end characters into spaces, to help our pattern
        # matching for whitespace.  Carriage returns are included because
        # the file is read in binary mode.
        $object =~ tr/\r\n/ /;

        # Determine the object type...

        # For PDFWriter, and pdflatex and distill. Eg:
        # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
        # Ie this looks like compressed text....
        if ($object =~ /\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
            $found_text_object = 1;
        }
        # For pdflatex or ps2pdf from dvi->ps:
        # if we are setting a font, then following object is probably text
        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
        elsif ($object =~ /obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
            $found_text_object = 1;
        }
        # Unencoded streams. Eg
        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
        elsif ($object =~ /<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) {
            $unenc_stream_objects++;
        }
    }
    close($pdf_fh);

    # Decide whether to accept or reject...
    # some of these numbers are completely arbitrary based on a few .pdfs.
    if ($found_text_object
        || ($num_objects <= 1500 && $unenc_stream_objects > 5))
    {
        return 1;
    }
    return 0;
}

# Quote $text so that the shell treats it as one literal argument.
# $on_windows selects the quoting style.
sub _shell_quote {
    my ($text, $on_windows) = @_;

    # Windows: plain double quotes.
    return "\"$text\"" if $on_windows;

    # Unix shells: single quotes switch off every expansion; a single quote
    # inside the text is written as '\'' (close, escaped quote, reopen).
    (my $escaped = $text) =~ s/'/'\\''/g;
    return "'$escaped'";
}
245
# Script entry point: run the conversion on the command-line arguments.
# The value returned by main() is discarded here.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.