Context Navigation

source: main/trunk/greenstone2/bin/script/pdftohtml.pl@ 22642

Last change on this file since 22642 was 7643, checked in by jrm21, 20 years ago
use case-insensitive match for <title> tags
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	#!/usr/bin/perl -w
2
3
4	###########################################################################
5	#
6	# pdftohtml.pl -- convert PDF documents to HTML format
7	#
8	# A component of the Greenstone digital library software
9	# from the New Zealand Digital Library Project at the
10	# University of Waikato, New Zealand.
11	#
12	# Copyright (C) 2001 New Zealand Digital Library Project
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30	# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31	# PDF documents to HTML, and converts images to PNG format for display in
32	# the HTML pages generated
33
BEGIN {
    # The script cannot run outside a Greenstone environment: GSDLHOME
    # tells us where the installation's perllib directory is, and that
    # directory is put at the front of the module search path
    # (presumably so the parsargv and util modules used below resolve).
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
38
39	use parsargv;
40	use util;
41	use Cwd;
42	use File::Basename;
43
# Print the command-line synopsis to STDERR and terminate the script
# with exit status 1.  Never returns.
sub print_usage {
    # note - most of these options are accepted but never actually used...
    my @usage_text = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
58
# Convert a PDF document to HTML by running the external pdftohtml
# utility, then post-process the generated page and convert any
# extracted images to PNG.
#
# Arguments: the command-line argument list, i.e.
#            [options] <PDF-file> <html-file>
# Returns:   0 on success;
#            1 if pdftohtml fails, if no .html file was produced, or if
#            the page contains no extractable text (unless -a is given).
# Dies if the output directory is missing/unwritable or if the
# intermediate HTML files cannot be opened or written.
# Exits through print_usage() when the arguments are malformed.
sub main {
    use strict;

    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # exactly two positional arguments must remain: input and output
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # basename of the input file; both '/' and '\' are accepted as
    # directory separators.  Only a trailing ".pdf" is stripped.
    my @path_parts = split(/(\/|\\)/, $input_filename);
    my $input_basename = pop(@path_parts);
    $input_basename = "" unless defined $input_basename;
    $input_basename =~ s/\.pdf$//i;

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # formulate the command
    my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -c" if ($complex);
    $cmd .= " -hidden" if ($hidden);
    $cmd .= " -zoom $zoom";
    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";

    if (! -e $html_file) {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    &util::mv($html_file, $tmp_file);
    $! = 0;
    # low-precedence "or" so that the die is tied to open() itself and
    # not to the filename argument
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    # with -a we pretend text has already been seen, which disables
    # the "no extractable text" failure further down
    my $seen_textual_content = $allow_no_text ? 1 : 0;

    # HTML numeric character reference for '_'.  Assembled from two
    # pieces so that the literal survives if this source is ever
    # rendered through an HTML viewer.
    my $escaped_underscore = '&' . '#95;';

    # check for unicode byte-order marker at the start of the file
    my $line = <$in_fh>;
    $line =~ s#\376\377##g if defined $line;

    while (defined $line) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if ($seen_textual_content == 0) {
            # strip tags, a "Page N" marker and whitespace; anything
            # left over counts as real text
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@i) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # blank the title and keep the name as metadata
                    $line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
                }
            }
        }

        # relative hrefs to own document...
        # (\Q..\E so that regex metacharacters in the name match literally)
        $line =~ s@href=\"\Q$input_basename\E\.html\#@href=\"\#@go;

        # escape underscores, but not if they're inside tags
        # (eg img/href names).  Tag state is tracked within one line only.
        if ($line =~ /_/) {
            my $inatag = 0;
            # limit of -1 keeps trailing empty fields, so underscores at
            # the very end of the line are not lost
            my @parts = split(/_/, $line, -1);
            my $lastpart = pop @parts;
            foreach my $part (@parts) {
                if ($part =~ /<[^>]*$/) { # if we're starting a tag...
                    $inatag = 1;
                } elsif ($part =~ />[^<]*$/) { # closing a tag
                    $inatag = 0;
                }
                $part .= $inatag ? '_' : $escaped_underscore;
            }
            $line = join('', @parts, $lastpart);
        }

        print $out_fh $line;
        $line = <$in_fh>;
    }
    close($in_fh);
    # buffered write errors only show up when the handle is closed
    close($out_fh)
        or die "Couldn't close $html_file: $!";
    &util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...

    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log")
        || open($log_fh, '<', "${directory}image.log")) {
        @images = <$log_fh>;
        close($log_fh);
        # NOTE(review): only image.log is cleaned up, never images.log
        &util::rm("${directory}image.log") if (-e "${directory}image.log");
    }
    chomp(@images);
    # ignore blank log lines so we never act on the bare directory
    @images = grep { /\S/ } @images;

    # no need to go any further if there is no text extracted from pdf.
    if ($seen_textual_content == 0) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        for my $image (@images) {
            &util::rm("${directory}$image");
        }
        return 1;
    }

    for my $image (@images) {
        my $image_path = "${directory}$image";

        # the PNG takes the part of the name before the first dot
        my @nameparts  = split(/\./, $image);
        my $image_base = shift(@nameparts);
        my $png_path   = "${directory}$image_base.png";

        my $cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            # NOTE(review): no output redirection here; presumably the
            # windows pnmtopng writes its own output file - confirm
            $cmd = "pnmtopng \"$image_path\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            $cmd = "pnmtopng \"$image_path\" > \"$png_path\" 2>/dev/null";
            if (system($cmd) != 0) {
                # fall back to ImageMagick's convert
                $cmd = "convert \"$image_path\" \"$png_path\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }

        # drop the source image now that it has been converted, unless
        # it is already the PNG we want to keep
        &util::rm($image_path) if ($image_path ne $png_path);
    }

    return 0;
}
264
# Run the conversion and report its outcome to the caller as the
# process exit status (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
267

Note: See TracBrowser for help on using the repository browser.

Download in other formats: