Context Navigation

source: trunk/gsdl/bin/script/pdftohtml.pl@ 2977

Last change on this file since 2977 was 2976, checked in by jrm21, 22 years ago
minor change to test of @ARGV - scalar had brackets in wrong place.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Line
1	#!/usr/bin/perl -w
2
3
4	###########################################################################
5	#
6	# pdftohtml.pl -- convert PDF documents to HTML format
7	#
8	# A component of the Greenstone digital library software
9	# from the New Zealand Digital Library Project at the
10	# University of Waikato, New Zealand.
11	#
12	# Copyright (C) 2001 New Zealand Digital Library Project
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30	# pdftohtml.pl is a wrapper for running the pdftohtml utility, which
31	# converts PDF documents to HTML, and for converting the extracted images
32	# to PNG format for display in the generated HTML pages.
33
# Compile-time setup: refuse to run without GSDLHOME, then put the
# Greenstone perllib directory at the front of the module search path so
# that the project modules loaded below can be found.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
38
39	use parsargv;
40	use util;
41	use Cwd;
42	use File::Basename;
43
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage {
    # note - most of these options are never actually used...
    my @usage_text = (
        "pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
    );

    print STDERR join("", @usage_text);
    exit(1);
}
55
# Return 1 if $line still contains anything after removing tags, one
# "Page <n>" marker and all whitespace; return 0 otherwise.
sub _line_has_text {
    my ($line) = @_;
    $line =~ s/<[^>]*>//g;
    $line =~ s/Page\s\d+//;
    $line =~ s/\s*//g;
    return ($line ne "") ? 1 : 0;
}

# Replace every underscore outside of a tag with the entity &#95;, leaving
# underscores inside tags (eg img/href names) untouched.  $in_tag_ref is a
# reference to a flag recording whether the previous line ended inside an
# unclosed tag; it is updated so that tags spanning several lines are
# handled correctly.  Returns the rewritten line.
sub _escape_underscores {
    my ($line, $in_tag_ref) = @_;
    my $escaped = '';
    # The capturing split keeps the '<' and '>' delimiters as list items.
    foreach my $chunk (split(/([<>])/, $line)) {
        my $text = $chunk;
        if ($text eq '<') {
            $$in_tag_ref = 1;
        } elsif ($text eq '>') {
            $$in_tag_ref = 0;
        } elsif (!$$in_tag_ref) {
            $text =~ s/_/&#95;/g;
        }
        $escaped .= $text;
    }
    return $escaped;
}

# Convert a PDF file to HTML by running the external pdftohtml program,
# tidy up the generated HTML, and convert the extracted images to PNG.
#
# Arguments: the command-line argument list -- optional flags -a (accept
#            documents with no extractable text) and -i (ignore images),
#            followed by <PDF-file> and <html-file>.
# Returns:   0 on success, 1 on failure.  Prints usage and exits on bad
#            arguments, exits with status 1 if the input is unreadable,
#            and dies if the generated HTML cannot be reopened.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images)) {
        print_usage();
    }

    # exactly two arguments must remain: the input and the output file
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristic code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text, so the
    # generated HTML is checked for textual content below.

    # Decide once whether we are on windows, so that every later test agrees.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) ? 1 : 0;

    # formulate the command.
    # don't include path on windows but assume that the PATH is set up
    # correctly.
    my $pdftohtml = $on_windows
        ? "pdftohtml"
        : util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # Passing a list to system() bypasses the shell, so file names that
    # contain spaces, quotes or other shell metacharacters need no quoting.
    my @pdftohtml_cmd = ($pdftohtml);
    push(@pdftohtml_cmd, "-i") if ($ignore_images);
    push(@pdftohtml_cmd, "-noframes", $input_filename, "$output_filestem.html");

    # system() returns -1 if it can't run, otherwise it's the command's
    # ret val.  note we return 0 if the file is "encrypted"
    $! = 0;
    if (system(@pdftohtml_cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave the .text and .err files for gsConvert.pl...
        return 1;
    }

    if (! -e "$output_filestem.html") {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    my $tmp_filename = "$output_filestem.html.tmp";
    util::mv("$output_filestem.html", $tmp_filename);

    # "or" (not "||") so that the die is tied to open() failing rather
    # than to the truth of the file name.
    open(my $in_fh, '<', $tmp_filename)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', "$output_filestem.html")
        or die "Couldn't open file for writing: $!";

    my $seen_textual_content = $allow_no_text ? 1 : 0;
    my $in_tag = 0; # persists across lines to allow multi-line tags
    while (defined(my $line = <$in_fh>)) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if (!$seen_textual_content && _line_has_text($line)) {
            $seen_textual_content = 1;
        }

        # escape underscores, but not if they're inside tags
        print {$out_fh} _escape_underscores($line, \$in_tag);
    }
    close($in_fh);
    # Buffered write errors only surface when the handle is closed.
    if (!close($out_fh)) {
        print STDERR "Error: unable to finish writing $output_filestem.html: $!\n";
        return 1;
    }
    util::rm($tmp_filename);

    # Need to convert images from PPM format to PNG format
    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...

    # Read the list of extracted images, one file name per line.  Only
    # read from the handle if one of the two candidate logs was opened.
    my @images;
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log")
        || open($log_fh, '<', "${directory}image.log")) {
        @images = <$log_fh>;
        close($log_fh);
        chomp(@images);
        @images = grep { $_ ne "" } @images; # ignore blank log lines
    } else {
        print STDERR "Error opening image log:$!\n";
    }
    # NOTE(review): only image.log is removed, as before; images.log is
    # left in place -- confirm whether that is intended.
    util::rm("${directory}image.log") if (-e "${directory}image.log");

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            util::rm("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        if ($on_windows) {
            my $cmd = "pnmtopng \"${directory}$image\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                # carry on with the remaining images rather than failing
                next;
            }
        } else {
            # the PNG is named after the part of the name before the first dot
            my ($image_base) = split(/\./, $image);
            $image_base = "" unless defined $image_base;
            my $cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($cmd) != 0) {
                # fall back to `convert' if pnmtopng is missing or fails
                $cmd = "convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    # carry on with the remaining images rather than failing
                    next;
                }
            }
        }
        # The image names are relative to $directory, as in the commands above.
        util::rm("${directory}$image");
    }

    return 0;
}
228
# Run the conversion and hand main's result back to the caller as the
# process exit status (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
231

Note: See TracBrowser for help on using the repository browser.

Download in other formats: