Context Navigation

source: trunk/gsdl/bin/script/pdftohtml.pl@ 7586

Last change on this file since 7586 was 7586, checked in by kjdon, 20 years ago
If we remove the title because it matches a filename, then we add in a meta tag with orig-title. This makes sure that the generated HTML files are not identical even when the PDFs have no text (if they are identical, they all get the same hash id and end up overwriting each other in the archives dir).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	#!/usr/bin/perl -w
2
3
4	###########################################################################
5	#
6	# pdftohtml.pl -- convert PDF documents to HTML format
7	#
8	# A component of the Greenstone digital library software
9	# from the New Zealand Digital Library Project at the
10	# University of Waikato, New Zealand.
11	#
12	# Copyright (C) 2001 New Zealand Digital Library Project
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30	# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31	# PDF documents to HTML, and converts images to PNG format for display in
32	# the HTML pages generated
33
# Compile-time setup: refuse to run without GSDLHOME, then put the
# Greenstone perllib directory at the front of the module search path so
# the "use" lines that follow can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
38
39	use parsargv;
40	use util;
41	use Cwd;
42	use File::Basename;
43
# Write the usage summary to STDERR and terminate the script with exit
# status 1. This sub never returns to its caller.
sub print_usage {
    # note - we don't actually ever use most of these options...
    my @usage_text = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
58
# Work out whether a run of HTML text ends inside an unclosed tag.
#
# $fragment   - the next piece of HTML text to examine
# $inside_tag - state before this fragment (true if a "<" is still open)
#
# Returns the state after the fragment: 1 if its last "<" has no ">" after
# it, 0 if its last ">" has no "<" after it, otherwise the incoming state.
sub _tag_state_after {
    my ($fragment, $inside_tag) = @_;
    return 1 if $fragment =~ /<[^>]*$/;
    return 0 if $fragment =~ />[^<]*$/;
    return $inside_tag;
}

# Replace underscores in text content with the HTML entity &#95;, leaving
# underscores inside tags (eg img/href names) untouched.
#
# $line       - one line of HTML
# $inside_tag - true if a tag opened on an earlier line is still open
#
# Returns a two-element list: the rewritten line and the tag state at the
# end of the line, to be passed back in for the following line.
sub _escape_underscores {
    my ($line, $inside_tag) = @_;

    # A limit of -1 keeps trailing empty fields, so an underscore at the
    # very end of the text is not silently dropped.
    my @pieces = split(/_/, $line, -1);
    my $tail = @pieces ? pop(@pieces) : '';

    my $escaped = '';
    foreach my $piece (@pieces) {
        $inside_tag = _tag_state_after($piece, $inside_tag);
        $escaped .= $piece . ($inside_tag ? '_' : '&#95;');
    }

    # The text after the last underscore can open or close a tag too.
    $inside_tag = _tag_state_after($tail, $inside_tag);

    return ($escaped . $tail, $inside_tag);
}

# Convert a PDF file to HTML by running the external pdftohtml program,
# then post-process the generated HTML and convert any extracted images
# to PNG.
#
# Arguments: the command-line argument list, i.e. optional switches
# (-a, -i, -c, -hidden, -zoom N) followed by <PDF-file> <html-file>.
#
# Returns 0 on success and 1 on failure (pdftohtml failed, produced no
# output file, or no text was extracted and -a was not given).
# Calls print_usage() (which exits) on bad arguments, exits with status 1
# if the input file is unreadable, and dies if the output directory is not
# writable or the HTML files cannot be opened/closed.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # Exactly two positional arguments must remain: input and output.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately. (File:: is included by util.pm)
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Base name of the input without directory or .pdf suffix. Both "/"
    # and "\" are treated as directory separators.
    my @path_parts = split(/[\/\\]/, $input_filename);
    my $input_basename = pop(@path_parts);
    $input_basename = "" unless defined $input_basename;
    $input_basename =~ s/\.pdf$//i; # only strip a trailing extension

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -c" if ($complex);
    $cmd .= " -hidden" if ($hidden);
    # The option spec asks for a default of 2; the guard only matters if
    # the parser ever leaves $zoom unset.
    $cmd .= " -zoom $zoom" if (defined $zoom && $zoom ne "");
    # NOTE(review): the command goes through the shell; file names that
    # contain a double quote will break the quoting below.
    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    # note we return 0 if the file is "encrypted"
    $!=0;
    if (system($cmd)!=0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    if (! -e "$output_filestem.html") {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    my $html_filename = "$output_filestem.html";
    my $tmp_filename = "$output_filestem.html.tmp";
    util::mv($html_filename, $tmp_filename);
    $!=0;
    # "or" (not "||") so that the die applies to open() itself rather
    # than to the always-true filename string.
    open(my $in_fh, '<', $tmp_filename)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_filename)
        or die "Couldn't open file for writing: $!";

    # Normalise to 0/1 so the numeric tests below never see undef.
    my $seen_textual_content = $allow_no_text ? 1 : 0;
    # Carried across lines so that a tag spanning several lines is still
    # recognised as a tag when underscores are escaped.
    my $inside_tag = 0;

    # check for unicode byte-order marker at the start of the file
    my $line = <$in_fh>;
    $line =~ s#\376\377##g if (defined $line);

    # Test definedness, not truth, so a final line of "0" is not dropped.
    while (defined $line) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if ($seen_textual_content == 0) {
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }
            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title, but keep it in a meta tag so that
                    # text-less documents do not all produce identical HTML
                    $line =~ s@<title>.*?</title>@<title></title><META NAME=\"Orig-title\" CONTENT=\"$title\">@;
                }
            }
        }

        # relative hrefs to own document... (\Q..\E because the file name
        # may contain regex metacharacters)
        $line =~ s@href=\"\Q$input_basename\E\.html\#@href=\"\#@g;

        # escape underscores, but not if they're inside tags (eg img/href
        # names). Run on every line so the tag state stays current.
        ($line, $inside_tag) = _escape_underscores($line, $inside_tag);

        print $out_fh $line;
        $line = <$in_fh>;
    }
    close($in_fh);
    # Buffered write errors only surface at close time.
    close($out_fh)
        or die "Couldn't open file for writing: $!";
    util::rm($tmp_filename);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log") ||
        open($log_fh, '<', "${directory}image.log")) {
        while (defined(my $entry = <$log_fh>)) {
            chomp($entry);
            next if ($entry eq ""); # a blank line names no image
            push(@images, $entry);
        }
        close($log_fh);
        # NOTE(review): only image.log is removed, even when images.log
        # was the file read - confirm whether that is intended.
        util::rm("${directory}image.log") if (-e "${directory}image.log");
    }

    # no need to go any further if there is no text extracted from pdf.
    if ($seen_textual_content == 0) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            util::rm("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        my $convert_cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            # NOTE(review): no output redirection here, unlike the branch
            # below - confirm where the PNG data ends up on Windows.
            $convert_cmd = "pnmtopng \"${directory}$image\"";
            if (system($convert_cmd)!=0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            my @nameparts = split(/\./, $image);
            my $image_base = shift(@nameparts);
            $convert_cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($convert_cmd)!=0) {
                $convert_cmd = "convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($convert_cmd)!=0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # The source image lives in $directory, like every other use above.
        util::rm("${directory}$image");
    }

    return 0;
}
264
# Run the conversion and hand its result to the shell as the process exit
# status (0 = success, non-zero = error).
my $exit_status = main(@ARGV);
exit($exit_status);
267

Note: See TracBrowser for help on using the repository browser.

Download in other formats: