1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 |
|
---|
4 | ###########################################################################
|
---|
5 | #
|
---|
6 | # pdftohtml.pl -- convert PDF documents to HTML format
|
---|
7 | #
|
---|
8 | # A component of the Greenstone digital library software
|
---|
9 | # from the New Zealand Digital Library Project at the
|
---|
10 | # University of Waikato, New Zealand.
|
---|
11 | #
|
---|
12 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
13 | #
|
---|
14 | # This program is free software; you can redistribute it and/or modify
|
---|
15 | # it under the terms of the GNU General Public License as published by
|
---|
16 | # the Free Software Foundation; either version 2 of the License, or
|
---|
17 | # (at your option) any later version.
|
---|
18 | #
|
---|
19 | # This program is distributed in the hope that it will be useful,
|
---|
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
22 | # GNU General Public License for more details.
|
---|
23 | #
|
---|
24 | # You should have received a copy of the GNU General Public License
|
---|
25 | # along with this program; if not, write to the Free Software
|
---|
26 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
27 | #
|
---|
28 | ###########################################################################
|
---|
29 |
|
---|
30 | # pdftohtml.pl is a wrapper for running pdftohtml utility which converts
|
---|
31 | # PDF documents to HTML, and converts images to PNG format for display in
|
---|
32 | # the HTML pages generated
|
---|
33 |
|
---|
BEGIN {
    # The project's Perl modules are loaded from $GSDLHOME/perllib, so
    # refuse to start when GSDLHOME is missing and otherwise put that
    # directory at the front of the module search path.
    my $gsdl_home = $ENV{'GSDLHOME'};
    die "GSDLHOME not set\n" unless defined $gsdl_home;
    unshift(@INC, "$gsdl_home/perllib");
}
|
---|
38 |
|
---|
39 | use parsargv;
|
---|
40 | use util;
|
---|
41 | use Cwd;
|
---|
42 | use File::Basename;
|
---|
43 |
|
---|
# Write the command-line synopsis to STDERR and terminate the script
# with exit status 1.  Never returns.
sub print_usage {
    print STDERR <<"END_OF_USAGE";
pdftohtml.pl wrapper for pdftohtml.
Usage: pdftohtml [options] <PDF-file> <html-file>
Options:
\t-i\tignore images (don't extract)
\t-a\tallow images only (continue even if no text is present)
\t-c\tproduce complex output (requires ghostscript)
\t-hidden\tExtract hidden text
\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)
END_OF_USAGE
    exit(1);
}
|
---|
58 |
|
---|
# Escape underscores that occur in text (i.e. outside HTML tags) as the
# character reference "&#95;"; underscores inside tags (eg img/href
# names) are left alone.
#
#   $line       - one line of HTML
#   $in_tag_ref - reference to a flag that is true while we are inside a
#                 "<...>" tag.  It is updated from every segment of the
#                 line, so a tag that spans several lines is tracked.
#
# Returns the line with the text underscores escaped.
sub _escape_underscores {
    my ($line, $in_tag_ref) = @_;

    # limit of -1 keeps trailing empty fields, so underscores at the very
    # end of a line (no trailing newline) are not silently dropped
    my @segments = split(/_/, $line, -1);
    my $last_index = $#segments;
    my $escaped = '';

    foreach my $i (0 .. $last_index) {
        my $segment = $segments[$i];

        # the last '<' or '>' in the segment decides the tag state;
        # a segment with neither leaves the state unchanged
        if ($segment =~ /<[^>]*$/) {        # a tag is left open
            $$in_tag_ref = 1;
        } elsif ($segment =~ />[^<]*$/) {   # the last tag was closed
            $$in_tag_ref = 0;
        }

        $escaped .= $segment;
        next if ($i == $last_index);        # no underscore after the last segment
        $escaped .= $$in_tag_ref ? '_' : '&#95;';
    }

    return $escaped;
}

# Convert one image written by pdftohtml into PNG format.
#
#   $directory - directory prefix (including trailing slash, may be "")
#   $image     - image file name as listed in the image log
#
# On windows only `pnmtopng' is tried; elsewhere `pnmtopng' is tried
# first and ImageMagick's `convert' second.  The redirections need a
# shell, so these commands are deliberately run as single strings.
#
# Returns 1 if a conversion command succeeded, 0 otherwise (after
# reporting the problem on STDERR).
sub _convert_image_to_png {
    my ($directory, $image) = @_;
    my $source = "${directory}$image";

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        my $cmd = "pnmtopng \"$source\"";
        return 1 if (system($cmd) == 0);
        print STDERR "Error executing $cmd\n";
        return 0;
    }

    # the PNG is named after the part of the image name before the first dot
    my @nameparts = split(/\./, $image);
    my $image_base = shift(@nameparts);
    my $target = "${directory}$image_base.png";

    return 1 if (system("pnmtopng \"$source\" > \"$target\" 2>/dev/null") == 0);
    return 1 if (system("convert \"$source\" \"$target\" 2>/dev/null") == 0);

    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    return 0;
}

# Convert a PDF file to HTML by running the pdftohtml utility, then
# post-process the generated HTML and convert any extracted images to PNG.
#
# Arguments: the command-line argument list, i.e.
#   [-a] [-i] [-c] [-hidden] [-zoom N] <PDF-file> <html-file>
#
# Returns 0 on success and 1 on failure (pdftohtml failed, no HTML file
# was produced, the HTML could not be written, or no text was extracted
# and -a was not given).  Calls print_usage() (which exits with status 1)
# for bad arguments, dies if the output directory is not writable or the
# HTML files cannot be opened, and exits with status 1 if the input file
# is not readable.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    # NOTE(review): 'zoom/\d+/2' presumably means "value must match \d+,
    # default 2" in parsargv's spec syntax -- confirm against parsargv.pm
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # exactly two arguments must remain: the input and the output file
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # file name of the input without its directory (either kind of slash)
    # and without a trailing ".pdf"
    my $input_basename = (split(/[\/\\]/, $input_filename))[-1];
    $input_basename = '' unless (defined $input_basename);
    $input_basename =~ s/\.pdf$//i;

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We
    # check for that below while post-processing the HTML.

    # formulate the command
    my $program = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows but assume
    # that the PATH is set up correctly.
    $program = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    # The command is kept as a list so that system() hands the file names
    # to pdftohtml as-is; quotes, spaces or shell metacharacters in a
    # file name can then neither break nor alter the command.
    my @cmd = ($program);
    push(@cmd, "-i") if ($ignore_images);
    push(@cmd, "-c") if ($complex);
    push(@cmd, "-hidden") if ($hidden);
    push(@cmd, "-zoom", $zoom) if (defined $zoom && $zoom ne "");
    push(@cmd, "-noframes", "-p", "-enc", "UTF-8",
         $input_filename, "$output_filestem.html");

    # system() returns -1 if it can't run, otherwise it's the command's
    # ret val.  note we return 0 if the file is "encrypted"
    $! = 0;
    if (system(@cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave any .text/.err files for gsConvert.pl...
        return 1;
    }

    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";

    if (! -e $html_file) {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    &util::mv($html_file, $tmp_file);
    $! = 0;
    # "or" (not "||") so that die applies to open() and not to the file name
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    my $seen_textual_content = $allow_no_text;
    my $in_tag = 0;          # carried across lines to allow multi-line tags
    my $first_line = 1;

    # matches relative hrefs that point back into our own document
    my $own_href_re = qr/href="\Q$input_basename\E\.html#/;

    # test for definedness so that a false-looking line (eg a final "0"
    # without a newline) does not end the loop early
    while (defined(my $line = <$in_fh>)) {
        if ($first_line) {
            # check for unicode byte-order marker (bytes 0xFE 0xFF) at
            # the start of the file
            # NOTE(review): that is the UTF-16 big-endian marker although
            # pdftohtml is run with -enc UTF-8 -- confirm it is still needed
            $line =~ s#\376\377##g;
            $first_line = 0;
        }

        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if (!$seen_textual_content) {
            # whatever is left after removing tags, a "Page N" label and
            # all whitespace counts as text
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@i) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title, but keep it as metadata
                    $line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
                }
            }
        }

        # relative hrefs to own document...
        $line =~ s/$own_href_re/href="#/g;

        # escape underscores, but not if they're inside tags
        $line = _escape_underscores($line, \$in_tag);

        print $out_fh $line;
    }
    close($in_fh);
    # buffered write errors (eg a full disk) only show up at close
    if (!close($out_fh)) {
        print STDERR "pdftohtml.pl: error writing $html_file: $!\n";
        return 1;
    }
    &util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log") ||
        open($log_fh, '<', "${directory}image.log")) {
        @images = <$log_fh>;
        close($log_fh);
        chomp(@images);
        # a blank line would otherwise name the directory itself
        @images = grep { $_ ne "" } @images;
        &util::rm("${directory}image.log") if (-e "${directory}image.log");
    }

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            &util::rm("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        # a failed conversion is reported but is not fatal; the original
        # image is then left in place
        next unless (_convert_image_to_png($directory, $image));
        # the images live in $directory, as in every other use above
        &util::rm("${directory}$image");
    }

    return 0;
}
|
---|
264 |
|
---|
# Run the conversion on the command-line arguments and use main()'s
# return value as the exit status of the script (0 = success).
exit(main(@ARGV));
|
---|
267 |
|
---|