source: trunk/gsdl/bin/script/pdftohtml.pl@ 2977

Last change on this file since 2977 was 2976, checked in by jrm21, 22 years ago

minor change to test of @ARGV - scalar had brackets in wrong place.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert PDF documents to HTML format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 2001 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # The script cannot locate its support modules without GSDLHOME, so
    # refuse to start if it is missing; otherwise put $GSDLHOME/perllib at
    # the front of the module search path (presumably where the parsargv
    # and util modules loaded below live -- confirm against the install).
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# Write the command-line synopsis to STDERR and terminate the script with
# exit status 1.  This sub never returns to its caller.
# Note: most of the listed options are never actually used.
sub print_usage {
    my @usage_lines = (
        "pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
    );

    print STDERR join('', @usage_lines);
    exit(1);
}
55
# Escape the underscores in one line of HTML as "&#95;", leaving alone any
# underscore that sits inside a tag (e.g. in img/href file names).
#
#   $line       - the text to process
#   $inatag_ref - reference to a 0/1 flag saying whether we are currently
#                 inside a "<...>" tag.  It is read and updated here so that
#                 a tag spanning several lines is tracked correctly when the
#                 caller passes the same flag for every line of a file.
#
# Returns the processed line.
sub escape_underscores {
    my ($line, $inatag_ref) = @_;

    my $escaped = '';
    # The capturing split hands back the delimiters as well, so every
    # character of $line (including a trailing underscore) is preserved.
    foreach my $token (split(/([<>_])/, $line)) {
        if ($token eq '<') {
            $$inatag_ref = 1;
        } elsif ($token eq '>') {
            $$inatag_ref = 0;
        }

        if ($token eq '_' && !$$inatag_ref) {
            $escaped .= '&#95;';
        } else {
            $escaped .= $token;
        }
    }

    return $escaped;
}

# Convert a PDF file to HTML by running the external pdftohtml program,
# then tidy up its output.
#
# Arguments (normally @ARGV): [-a] [-i] <PDF-file> <html-file>
#   -a  treat the document as acceptable even if no text was extracted
#   -i  pass "-i" to pdftohtml (ignore images)
#
# Steps:
#   1. run pdftohtml to produce <html-file>;
#   2. rewrite <html-file>: drop "</b><b>" and "</i><i>" pairs, double
#      every backslash, escape underscores outside tags, and note whether
#      any textual content is present;
#   3. convert each image named in the image log to PNG (pnmtopng, falling
#      back to convert on non-Windows systems) and delete the source image.
#
# Returns 0 on success and 1 on failure.  Prints the usage text and exits
# on bad arguments, exits with status 1 if the input file is unreadable,
# and dies if the HTML file cannot be opened or written for post-processing.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images)) {
        print_usage();
    }

    # exactly two arguments must remain: the input and the output file
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text, so the
    # post-processing loop below checks for textual content instead.

    # formulate the command
    my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    my $html_file = "$output_filestem.html";
    if (! -e $html_file) {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    my $tmp_file = "$html_file.tmp";
    &util::mv($html_file, $tmp_file);

    # "or" (not "||") so that the die applies to open() itself rather than
    # to the always-true filename string.
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    my $seen_textual_content = $allow_no_text ? 1 : 0;
    my $inatag = 0; # kept across lines to allow multi-line tags
    while (defined(my $line = <$in_fh>)) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text: whatever is left once tags, a
        # "Page N" label and all whitespace are stripped counts as text
        if (!$seen_textual_content) {
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            $seen_textual_content = 1 if ($tmp_line ne "");
        }

        # escape underscores, but not if they're inside tags
        $line = escape_underscores($line, \$inatag);

        print $out_fh $line;
    }
    close($in_fh);
    close($out_fh)
        or die "Couldn't finish writing $html_file: $!";
    &util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...

    # The log is looked for under either name; only read it if one of the
    # opens actually succeeded.
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log")
        || open($log_fh, '<', "${directory}image.log")) {
        @images = <$log_fh>;
        chomp(@images);
        close($log_fh);
    } else {
        print STDERR "Error opening image log:$!\n";
    }
    &util::rm("${directory}image.log") if (-e "${directory}image.log");

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            &util::rm("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        my $convert_cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            $convert_cmd = "pnmtopng \"${directory}$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            # the PNG takes the image name up to its first "."
            my @nameparts = split(/\./, $image);
            my $image_base = shift(@nameparts);
            $convert_cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                $convert_cmd = "convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # remove the converted source image, using the same path that the
        # conversion commands above were given
        &util::rm("${directory}$image");
    }

    return 0;
}
228
# Run the conversion and report its result as the process exit status
# (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
231
Note: See TracBrowser for help on using the repository browser.