source: trunk/gsdl/bin/script/pdftohtml.pl@ 7586

Last change on this file since 7586 was 7586, checked in by kjdon, 20 years ago

If we remove the title because it matches a filename, we now add a meta tag with orig-title. This makes sure that the generated HTML files are not identical even when the PDFs have no text (if they are identical, they all get the same hash ID and end up overwriting each other in the archives dir).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert PDF documents to HTML format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 2001 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
# Compile-time setup: refuse to run without GSDLHOME, then put the
# Greenstone perllib directory at the front of the module search path
# so that the "use" statements that follow can be resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage -- write the usage summary to STDERR and terminate the
# script with exit status 1. Never returns to the caller.
sub print_usage {
    # note - we don't actually ever use most of these options...
    my @usage_lines = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
58
# main(@args) -- convert one PDF document to HTML using the external
# pdftohtml program, then tidy up the generated HTML and images.
#
# Arguments (normally the script's @ARGV):
#   [options] <PDF-file> <html-file>    (options are listed in print_usage)
#
# Steps:
#   1. parse the options and check the input file and output directory;
#   2. run pdftohtml to produce "<stem>.html";
#   3. rewrite that file line by line: join split bold/italic runs,
#      double backslashes, blank out a <title> that is just a file name,
#      make self-referencing hrefs relative and escape underscores that
#      are outside tags, while noting whether any real text was seen;
#   4. convert the images listed in images.log / image.log to PNG.
#
# Returns 0 on success and 1 on failure; the caller uses the value as the
# process exit status. Calls print_usage (which exits) on bad arguments,
# exits with status 1 if the input file is unreadable, and dies if the
# output directory is not writable or the HTML files cannot be opened.
sub main {
    my @args = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # After option parsing exactly two arguments must remain:
    # the input PDF and the output HTML file.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # Base name of the input file (last path component, either slash
    # style) without its extension. Only a trailing ".pdf" is stripped,
    # so a name such as "my.pdf.notes.pdf" keeps its inner ".pdf".
    my @path_parts = split (/(\/|\\)/, $input_filename);
    my $input_basename = pop(@path_parts);
    $input_basename =~ s/\.pdf$//i;

    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # formulate the command
    my $program = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows but assume
    # that the PATH is set up correctly.
    $program = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    # The command is kept as a list and handed to system() in list form,
    # so file names containing spaces, quotes or shell metacharacters are
    # passed to pdftohtml verbatim instead of being re-parsed by a shell.
    my @cmd = ($program);
    push(@cmd, "-i")      if ($ignore_images);
    push(@cmd, "-c")      if ($complex);
    push(@cmd, "-hidden") if ($hidden);
    push(@cmd, "-zoom", $zoom);
    push(@cmd, "-noframes", "-p", "-enc", "UTF-8",
         $input_filename, "$output_filestem.html");

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system(@cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    my $html_file = "$output_filestem.html";
    if (! -e $html_file) {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    my $tmp_file = "$html_file.tmp";
    &util::mv($html_file, $tmp_file);

    # "or" (not "||") so that the die applies to open() itself rather
    # than to the file name string.
    open(my $in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    # Normalised to 0/1 so the numeric tests below never see an empty
    # or undefined option value.
    my $seen_textual_content = $allow_no_text ? 1 : 0;

    # Matches an href pointing back into this same document; the base
    # name is quoted so regex metacharacters in it are taken literally.
    my $own_href = qr/href=\"\Q$input_basename\E\.html\#/;

    my $first_line = 1;
    # defined() so that a final line consisting of just "0" is not
    # mistaken for end of file.
    while (defined(my $line = <$in>)) {
        if ($first_line) {
            # check for unicode byte-order marker at the start of the file
            # (these are the UTF-16 big-endian BOM bytes)
            $line =~ s#\376\377##g;
            $first_line = 0;
        }

        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if ($seen_textual_content == 0) {
            # Strip tags, one "Page N" marker and all whitespace; anything
            # left over counts as textual content.
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title, but keep it in a meta tag so that
                    # the generated file still differs between documents
                    $line =~ s@<title>.*?</title>@<title></title><META NAME=\"Orig-title\" CONTENT=\"$title\">@;
                }
            }
        }

        # relative hrefs to own document...
        $line =~ s@$own_href@href=\"\#@g;

        # escape underscores, but not if they're inside tags (eg img/href names)
        # NOTE: the in-tag state is deliberately reset for every line; the
        # text after the last underscore is never inspected, so carrying
        # the flag across lines would leave it stale.
        my $inatag = 0;
        if ($line =~ /_/) {
            my @parts = split('_', $line);
            my $lastpart = pop @parts;
            foreach my $part (@parts) {
                if ($part =~ /<[^>]*$/) { # if we're starting a tag...
                    $inatag = 1;
                } elsif ($part =~ />[^<]*$/) { # closing a tag
                    $inatag = 0;
                }
                # $part aliases the array element, so this re-attaches the
                # (possibly escaped) underscore that split() removed.
                if ($inatag) {
                    $part .= '_';
                } else {
                    $part .= "&#95;";
                }
            }
            $line = join('', @parts, $lastpart);
        }

        print $out $line;
    }
    close($in);
    # Buffered write errors only surface at close, so check it.
    close($out) or die "Couldn't close file after writing: $!";
    &util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...

    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $log;
    if (open($log, '<', "${directory}images.log") ||
        open($log, '<', "${directory}image.log")) {
        @images = <$log>;
        chomp(@images);
        close($log);
        # NOTE(review): only image.log is removed; images.log is left in
        # place, as in the original code -- confirm whether that is intended.
        &util::rm("${directory}image.log") if (-e "${directory}image.log");
    }

    # no need to go any further if there is no text extracted from pdf.
    if ($seen_textual_content == 0) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        for my $image (@images) {
            &util::rm("${directory}$image");
        }
        return 1;
    }

    for my $image (@images) {
        my $convert_cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            $convert_cmd = "pnmtopng \"${directory}$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            # Output name is everything before the first "." plus ".png".
            my @nameparts = split(/\./, $image);
            my $image_base = shift(@nameparts);
            # These commands need shell redirection, so they stay as
            # shell strings; try pnmtopng first, then fall back to convert.
            $convert_cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                $convert_cmd = "convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # Remove the source image from the output directory, where the
        # conversion commands above read it from.
        &util::rm("${directory}$image");
    }

    return 0;
}
264
# Run the conversion and hand main's return value back to the caller
# as the process exit status (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
267
Note: See TracBrowser for help on using the repository browser.