source: main/trunk/greenstone2/bin/script/pdftohtml.pl@ 24600

Last change on this file since 24600 was 24600, checked in by ak19, 13 years ago

Added gs-magick.pl script which will set the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the ImageMagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal linux execution. All the Greenstone files in the *binary* that made direct calls to imagemagick now go through this script. The affected files are perl files in bin/script and perllib and Gatherer.java of GLI. (wvware has files that test for imagemagick during compilation stage, which is independent of our changes, which are only for users running imagemagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their imagemagick command: they would query $? and/or $! and/or run the command with backtick operators to get the output printed to STDOUT. Because an intermediate gs-magick.pl file has been inserted, we needed to ensure that the exit code stored in $? would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing it with equals/not equal to 0). To get the correct exit code as emitted by imagemagick, calling code needs to shift the bits in $? and convert it to a signed value.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert PDF documents to HTML format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 2001 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
# Compile-time setup: refuse to run when GSDLHOME is not set, otherwise put
# the perllib directory under GSDLHOME at the front of the module search
# path so the "use" lines below can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# Write the command-line synopsis to STDERR and terminate the script with
# exit status 1.  This sub never returns to its caller.
sub print_usage {
    # note - most of the options listed here are accepted but never
    # actually used...
    my @usage_text = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
58
# Convert one PDF file to HTML by running the external pdftohtml utility,
# then post-process the generated HTML and convert any extracted images
# to PNG.
#
# Arguments: the command-line argument list, i.e. optional switches
#            (-a, -i, -c, -hidden, -zoom N) followed by exactly two
#            positional arguments: <PDF-file> <html-file>.
# Returns:   0 on success, 1 on failure (pdftohtml failed, no HTML was
#            produced, or no text could be extracted and -a was not given).
# Dies/exits: prints usage and exits on bad arguments; exits with 1 when
#            the input is unreadable; dies when the output directory is
#            not writable or the intermediate files cannot be opened.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # exactly two positional arguments must remain after option parsing
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Last path component of the input file (either slash style counts as
    # a separator), minus a trailing ".pdf" extension.
    my @path_parts = split(/(\/|\\)/, $input_filename);
    my $input_basename = pop(@path_parts);
    $input_basename = "" unless defined $input_basename;
    $input_basename =~ s/\.pdf$//i;

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # formulate the command
    my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -c" if ($complex);
    $cmd .= " -hidden" if ($hidden);
    $cmd .= " -zoom $zoom";
    # NOTE(review): the file names are only wrapped in double quotes before
    # being handed to the shell; a name containing a double quote (or, on
    # unix, a backtick or dollar sign) would not be passed through intact.
    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run the command, otherwise the
    # command's wait status.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    if (! -e "$output_filestem.html") {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    &util::mv("$output_filestem.html", "$output_filestem.html.tmp");
    $! = 0;
    # "or" (not "||") so that a failed open really does reach die
    open(my $in_fh, '<', "$output_filestem.html.tmp")
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', "$output_filestem.html")
        or die "Couldn't open file for writing: $!";

    # -a means "an image-only document is fine", so treat text as seen
    my $seen_textual_content = $allow_no_text ? 1 : 0;

    # links that point back into the document itself
    my $self_href_re = qr/href="\Q$input_basename\E\.html#/;

    # true while we are inside a tag left open by an earlier line
    my $in_tag = 0;
    my $first_line = 1;

    while (defined(my $line = <$in_fh>)) {
        if ($first_line) {
            # strip a unicode byte-order marker at the start of the file
            $line =~ s#\376\377##g;
            $first_line = 0;
        }

        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if (!$seen_textual_content) {
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@i) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title
                    $line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
                }
            }
        }

        # relative hrefs to own document...
        $line =~ s/$self_href_re/href="#/g;

        # escape underscores, but not if they're inside tags
        # (eg img/href names); $in_tag carries over to the next line
        $line = _escape_underscores($line, \$in_tag);

        print $out_fh $line;
    }
    close($in_fh);
    # buffered write errors only show up when the handle is closed
    close($out_fh)
        or die "Couldn't finish writing $output_filestem.html: $!";
    &util::rm("$output_filestem.html.tmp");

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $images_fh;
    if (open($images_fh, '<', "${directory}images.log") ||
        open($images_fh, '<', "${directory}image.log")) {
        @images = <$images_fh>;
        close($images_fh);
        &util::rm("${directory}image.log") if (-e "${directory}image.log");
    }
    chomp(@images);
    # a blank log line would otherwise name the directory itself
    @images = grep { $_ ne "" } @images;

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            &util::rm("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        my $cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            $cmd = "pnmtopng \"${directory}$image\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            # name of the image up to its first dot
            my ($image_base) = split(/\./, $image);
            $cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($cmd) != 0) {
                # fall back to ImageMagick's convert, run via gs-magick.pl
                $cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # the source image lives next to the output file, not in the cwd
        &util::rm("${directory}$image");
    }

    return 0;
}

# Replace every underscore that is outside an HTML tag with "&#95;".
#   $line       - one line of HTML text
#   $in_tag_ref - reference to a flag that is true while a "<" has been
#                 seen without its closing ">"; it is updated in place so
#                 that a tag spanning several lines is handled correctly
# Returns the line with text underscores escaped; underscores inside tags
# are left untouched.
sub _escape_underscores {
    my ($line, $in_tag_ref) = @_;

    my $escaped = "";
    # the capturing group makes split hand back the delimiters as well
    foreach my $piece (split(/([<>_])/, $line)) {
        if ($piece eq "<") {
            $$in_tag_ref = 1;
        } elsif ($piece eq ">") {
            $$in_tag_ref = 0;
        } elsif ($piece eq "_" && !$$in_tag_ref) {
            $piece = "&#95;";
        }
        $escaped .= $piece;
    }
    return $escaped;
}
264
# Run the conversion on the command-line arguments and use main's return
# value as this script's exit status (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
267
Note: See TracBrowser for help on using the repository browser.