Context Navigation

source: trunk/gsdl/bin/script/pdftohtml.pl@ 2289

Last change on this file since 2289 was 2289, checked in by jrm21, 23 years ago
check if system() returns != 0, rather than just > 0 (-1 => can't run).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.6 KB

Line
1	#!/usr/bin/perl -w
2
3
4	###########################################################################
5	#
6	# pdftohtml.pl -- convert documents to HTML or TEXT format
7	#
8	# A component of the Greenstone digital library software
9	# from the New Zealand Digital Library Project at the
10	# University of Waikato, New Zealand.
11	#
12	# Copyright (C) 1999 New Zealand Digital Library Project
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30	# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31	# PDF documents to HTML, and converts images to PNG format for display in
32	# the HTML pages generated
33
# Compile-time setup: refuse to start without GSDLHOME, then put
# $GSDLHOME/perllib at the front of @INC so that the `use` statements
# that follow can be resolved from there (presumably where the
# project-local modules such as parsargv and util live -- confirm).
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
38
39	use parsargv;
40	use util;
41	use Cwd;
42	use File::Basename;
43
# print_usage()
#
# Writes the command-line synopsis to STDERR and terminates the script
# with exit status 1.  Never returns.
sub print_usage {
    # Non-interpolating here-doc: the text is emitted exactly as written.
    print STDERR <<'END_OF_USAGE';
pdftohtml version 0.22
Usage: pdftohtml [options] <PDF-file> <html-file>
 -f <int> : first page to convert
 -l <int> : last page to convert
 -d <dir> : target directory (default: basename of pdf-file)
 -o <file> : name of output file; - means stdout (default index.html)
 -q : don't print any messages or errors
 -h : print this usage information
 -p : exchange .pdf links by .html
 -c : generate complex HTML document
 -F : don't use frames in HTML document
 -i : ignore images
 -e <string> : set extension for images (in the Html-file) (default png)
END_OF_USAGE
    exit(1);
}
62
# _pdf_appears_textual($input_filename)
#
# Heuristic (originally by John McPherson, based purely on observation of
# a few PDFs rather than on the PDF specification) that tries to reject
# PDFs which contain no text.
#
# The file is scanned object by object ("N N obj" ... "endobj").  The scan
# stops as soon as one object looks like text (a filtered/compressed
# stream, or a font dictionary); otherwise unencoded "BT" text streams
# are counted.
#
# Returns 1 if the PDF should be accepted, 0 if it should be rejected.
# Dies if the file cannot be opened.
sub _pdf_appears_textual {
    my ($input_filename) = @_;

    # Three-argument open with a lexical handle, so that characters in the
    # filename can never be interpreted as an open mode.
    open(my $pdf_fh, '<', $input_filename)
	or die "Error: unable to open $input_filename for reading\n";

    my $found_text_object    = 0;
    my $num_objects          = 0;
    my $non_text_objects     = 0;   # only of interest when debugging
    my $unenc_stream_objects = 0;

    while (!$found_text_object && defined(my $line = <$pdf_fh>)) {
	$line =~ s/\r/\n/g;

	# Anything outside an object (header, trailer, ...) is ignored.
	next unless $line =~ /^\d+ \d+ obj/m;

	# Start of a new object: accumulate lines up to its "endobj".
	$num_objects++;
	my $object = "";
	while (!eof($pdf_fh) && $line !~ /(>>\s*)?endobj/) {
	    $object .= $line;
	    $line = <$pdf_fh>;
	}
	$line = "" unless defined $line;  # end of file in a funny place
	$object .= $line;

	# Remove newline chars, to help the whitespace pattern matching.
	$object =~ s/\n/ /g;

	# For PDFWriter, pdflatex and distill.  Eg:
	# "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
	# Ie this looks like compressed text....
	if ($object =~ /\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s.\s\/Filter/) {
	    $found_text_object = 1;
	}
	# For pdflatex or ps2pdf from dvi->ps:
	# if we are setting a font, the following object is probably text.
	# Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
	elsif ($object =~ /obj\s<<\s(\/ProcSet \[.+?\]\s*)?\/Font /s) {
	    $found_text_object = 1;
	}
	# Unencoded streams.  Eg
	# "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
	elsif ($object =~ /<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) {
	    $unenc_stream_objects++;
	}
	# (some) non-text objects
	elsif ($object =~ /<<.\/(Type).>>/s) {
	    $non_text_objects++;
	}
    }
    close($pdf_fh);

    # print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";

    # Decide whether to accept or reject.  Some of these numbers are
    # completely arbitrary, based on a few .pdfs.
    return 1 if $found_text_object;
    return 1 if ($num_objects <= 1500 && $unenc_stream_objects > 5);
    return 0;
}

# main(@args)
#
# Wrapper around the external pdftohtml utility.
#
# Arguments: the command line, i.e. options followed by exactly two
# positional arguments, <PDF-file> and <html-file>.
#
# Steps:
#   1. parse the options (prints usage and exits 1 on bad arguments or -h);
#   2. check that the PDF is readable and appears to contain text
#      (exits 1 otherwise);
#   3. run pdftohtml with -noframes, redirecting its stdout to
#      "<stem>.out" and, except on windows, its stderr to "<stem>.err";
#   4. convert every image listed in "images.log" (looked up in the
#      current directory) to PNG and remove the original image.
#
# Returns 1 on success, 0 if pdftohtml or an image conversion failed.
#
# NOTE: the option values are parsed but, apart from -h, are not
# forwarded to pdftohtml; the command is fixed in this script.
sub main {
    my (@args) = @_;
    my ($first,$last,$target_dir,$out_file,$img_ext,
	$optq,$opth,$optp,$optc,$optF,$opti);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
			 'f/\d+/1', \$first,
			 'l/\d+/1', \$last,
			 'd/[\S]*/', \$target_dir,
			 'o/[\S]*/', \$out_file,
			 'e/[\S]*/', \$img_ext,
			 'q', \$optq,
			 'h', \$opth,
			 'p', \$optp,
			 'c', \$optc,
			 'F', \$optF,
			 'i', \$opti
			 ))
    {
	print_usage();
    }

    # -h is documented as "print this usage information"
    print_usage() if $opth;

    # exactly two positional arguments are required
    if (scalar(@args) != 2) {
	print_usage();
    }

    my $input_filename = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
	print STDERR "Error: unable to open $input_filename for reading\n";
	exit(1);
    }

    if (!_pdf_appears_textual($input_filename)) {
	# reject this .pdf.
	print STDERR "pdftohtml.pl: $input_filename appears to have no ";
	print STDERR "textual data. Aborting.\n";
	exit(1);
    }

    # One platform test, used consistently below.
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $is_windows = ($gsdlos =~ /^windows$/i) ? 1 : 0;

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $gsdlos, "pdftohtml.bin");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly - note also that on windows
    # we use pdftohtml.exe not pdftohtml.bin
    $cmd = "pdftohtml" if $is_windows;

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $is_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    if (system($cmd) != 0) {
	print STDERR "Error executing $cmd: $!\n";
	# NOTE(review): the command above writes "<stem>.out", not
	# "<stem>.text"; the set of files removed here is kept unchanged
	# in case a caller inspects the others -- confirm.
	util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	return 0;
    }

    # Need to convert images from PPM format to PNG format.
    # A missing or unreadable images.log simply means nothing to convert.
    my @images;
    if (open(my $log_fh, '<', "images.log")) {
	@images = <$log_fh>;
	close($log_fh);
    }
    chomp(@images);
    @images = grep { length($_) } @images;  # skip blank lines

    for my $image (@images) {
	if ($is_windows) {
	    # NOTE(review): no output file is named on windows, so the
	    # result goes to pnmtopng's standard output -- confirm intent.
	    my $convert_cmd = "pnmtopng \"$image\"";
	    if (system($convert_cmd) != 0) {
		print STDERR "Error executing $convert_cmd\n";
		return 0; # not sure about whether to leave this one in or take it out
	    }
	} else {
	    # Replace only the final extension, so that dots elsewhere in
	    # the name (eg a leading "./") do not truncate the base name.
	    (my $image_base = $image) =~ s/\.[^.\/\\]*$//;

	    my $convert_cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
	    if (system($convert_cmd) != 0) {
		$convert_cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
		if (system($convert_cmd) != 0) {
		    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
		    return 0; # not sure about whether to leave this one in or take it out
		}
	    }
	}
	util::rm($image);
    }

    return 1;
}
245
# Script entry point.  The value returned by main is discarded here.
main(@ARGV);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: