source: trunk/gsdl/bin/script/pdftohtml.pl@ 2346

Last change on this file since 2346 was 2346, checked in by jrm21, 23 years ago

newer version of pdftohtml from upstream uses image.log instead of images.log,
so we check for both (for now...) as I am using some features from v0.31 in
our current pdftohtml (based on v0.22).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # The Greenstone perl modules loaded below live under $GSDLHOME/perllib,
    # so refuse to start without GSDLHOME and put that directory first on
    # the module search path.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage()
#
# Writes the command-line synopsis to STDERR and terminates the script
# with exit status 1.  Never returns to the caller.
sub print_usage {
    my @usage_text = (
        "pdftohtml version 0.22\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        " -f <int> : first page to convert\n",
        " -l <int> : last page to convert\n",
        " -d <dir> : target directory (default: basename of pdf-file)\n",
        " -o <file> : name of output file; - means stdout (default index.html)\n",
        " -q : don't print any messages or errors\n",
        " -h : print this usage information\n",
        " -p : exchange .pdf links by .html\n",
        " -c : generate complex HTML document\n",
        " -F : don't use frames in HTML document\n",
        " -i : ignore images\n",
        " -e <string> : set extension for images (in the Html-file) (default png)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
62
# _pdf_appears_textual($pdf_path)
#
# Heuristic (originally by John McPherson, based purely on observation of
# sample files rather than on the PDF specification) that tries to reject
# PDFs containing no text.  Scans the file object by object and returns
# true when either
#   * an object that looks like a filtered (compressed) stream or a font
#     definition is seen, or
#   * the file has at most 1500 objects and more than 5 unencoded text
#     streams ("... stream BT ...").
# The thresholds are arbitrary, derived from a handful of sample PDFs.
# Dies if the file cannot be opened.
sub _pdf_appears_textual {
    my ($pdf_path) = @_;

    # 3-arg open so that odd characters in the file name cannot be taken
    # as an open mode; binmode because PDF is binary data.
    open(my $pdf_fh, '<', $pdf_path)
        or die "Error: unable to open $pdf_path for reading\n";
    binmode($pdf_fh);

    my $found_text_object    = 0;
    my $num_objects          = 0;
    my $unenc_stream_objects = 0;

    LINE:
    while (!$found_text_object && defined(my $line = <$pdf_fh>)) {
        # treat bare carriage returns as line breaks
        $line =~ s/\r/\n/g;

        # only lines that start a new object ("12 0 obj") are interesting
        next LINE unless $line =~ /^\d+ \d+ obj/ms;
        $num_objects++;

        # collect everything up to and including the line holding "endobj"
        my $object = "";
        while (!eof($pdf_fh) && $line !~ /(>>\s*)?endobj/) {
            $object .= $line;
            $line = <$pdf_fh>;
        }
        $line = "" unless defined $line;    # end of file in a funny place
        $object .= $line;

        # flatten newlines to help the whitespace matching below
        $object =~ s/\n/ /gs;

        # For PDFWriter, pdflatex and distill.  Eg:
        # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
        # ie this looks like compressed text.
        if ($object =~ /\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
            $found_text_object = 1;
        }
        # For pdflatex or ps2pdf from dvi->ps: if a font is being set then
        # the following object is probably text.
        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
        elsif ($object =~ /obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
            $found_text_object = 1;
        }
        # Unencoded streams.  Eg
        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
        elsif ($object =~ /<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) {
            $unenc_stream_objects++;
        }
    }
    close($pdf_fh);

    return 1 if $found_text_object;
    return 1 if $num_objects <= 1500 && $unenc_stream_objects > 5;
    return 0;
}

# _convert_logged_images($is_windows)
#
# pdftohtml records the images it extracted in a log file in the current
# directory ("images.log" for the v0.22-based build, "image.log" for newer
# upstream versions).  Each listed image is converted to PNG with
# `pnmtopng' (falling back to ImageMagick's `convert' on non-Windows
# systems) and the original is then removed.
#
# Returns 1 on success, including the case where no image log could be
# opened (reported on STDERR but not treated as fatal), and 0 as soon as
# one image cannot be converted.
sub _convert_logged_images {
    my ($is_windows) = @_;

    my $log_fh;
    unless (open($log_fh, '<', "images.log")
            or open($log_fh, '<', "image.log")) {
        print STDERR "Error opening image log:$!\n";
        return 1;    # nothing to convert; do not read an unopened handle
    }
    my @images = <$log_fh>;
    close($log_fh);

    for my $image (@images) {
        chomp($image);
        next unless length $image;    # ignore blank lines in the log

        if ($is_windows) {
            my $cmd = "pnmtopng \"$image\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                return 0;
            }
        }
        else {
            # Strip only the final extension, so that dots elsewhere in
            # the path ("./out-1.ppm", "my.doc-1.ppm") are preserved.
            (my $image_base = $image) =~ s/\.[^.\/\\]+$//;

            my $cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
            if (system($cmd) != 0) {
                $cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    return 0;
                }
            }
        }
        util::rm($image);
    }

    return 1;
}

# main(@command_line_arguments)
#
# Wrapper around the pdftohtml binary: validates the arguments, rejects
# PDFs that appear to contain no text, runs pdftohtml to produce
# "<html-file>" (stdout goes to "<stem>.out" and, except on Windows,
# stderr to "<stem>.err"), then converts the extracted images to PNG.
#
# Expects exactly two non-option arguments: the input PDF and the output
# HTML file name.  Prints the usage text and exits with status 1 on bad
# arguments or -h; exits with status 1 if the input is unreadable or looks
# text-free.  Otherwise returns 1 on success and 0 if pdftohtml or an image
# conversion failed.
sub main {
    my (@args) = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optc, $optF, $opti);

    # Read the command-line arguments so that the command built below can
    # be changed in this script.  Apart from -h the options are currently
    # accepted but not forwarded to pdftohtml.
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                        ))
    {
        print_usage();
    }

    # -h is advertised in the usage text, so honour it
    print_usage() if $opth;

    # exactly an input file and an output file must remain
    print_usage() if scalar(@args) != 2;

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//;    # pdftohtml adds this suffix

    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    if (!_pdf_appears_textual($input_filename)) {
        print STDERR "pdftohtml.pl: $input_filename appears to have no ";
        print STDERR "textual data. Aborting.\n";
        exit(1);
    }

    # One consistent, case-insensitive platform test for the whole routine.
    my $gsdl_os    = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $is_windows = ($gsdl_os =~ /^windows/i) ? 1 : 0;

    # formulate the command
    my $cmd;
    if ($is_windows) {
        # Don't include the path on windows (to avoid having to play about
        # with quoting when GSDLHOME might contain spaces) but assume that
        # the PATH is set up correctly - note also that on windows we use
        # pdftohtml.exe not pdftohtml.bin
        $cmd = "pdftohtml";
    }
    else {
        my $binary = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                        $ENV{'GSDLOS'}, "pdftohtml.bin");
        $cmd = "\"$binary\"";    # quoted in case GSDLHOME contains spaces
    }

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $is_windows;

    # system() returns -1 if the command could not be started (only then is
    # $! meaningful); otherwise it returns the child's wait status.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason = ($status == -1) ? "could not run command: $!"
                   : ($status & 127) ? "killed by signal " . ($status & 127)
                   :                   "exit status " . ($status >> 8);
        print STDERR "Error executing $cmd: $reason\n";
        # NOTE(review): nothing in this script creates "<stem>.text"; the
        # redirected stdout goes to "<stem>.out" -- confirm which file the
        # caller expects to be cleaned up here.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err")  if (-e "$output_filestem.err");
        return 0;
    }

    # Need to convert images from PPM format to PNG format
    return _convert_logged_images($is_windows);
}
247
# Run the conversion and report the outcome through the process exit
# status: main() returns a true value on success and 0 on failure, so map
# that to the conventional 0 (success) / 1 (failure) for calling scripts.
exit(main(@ARGV) ? 0 : 1);
Note: See TracBrowser for help on using the repository browser.