source: trunk/gsdl/bin/script/pdftohtml.pl@ 2651

Last change on this file since 2651 was 2651, checked in by jrm21, 23 years ago

pnmtopng was failing due to spaces in filenames on windows. Also need to
replace \s with \\s (i.e. escape each backslash as a double backslash).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
# Compile-time setup: require GSDLHOME to be set and put its perllib
# directory at the front of @INC, so the `use` statements that follow this
# block are resolved against it first.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    defined $gsdl_home or die "GSDLHOME not set\n";
    unshift(@INC, $gsdl_home . "/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage() -- write the command-line synopsis to STDERR, then terminate
# the script with exit status 1. Never returns.
sub print_usage {
    # note - most of the options listed here are accepted but never
    # actually used by this wrapper...
    my @usage_lines = (
        "pdftohtml version 0.22 - modified for NZDL use\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        " -f <int> : first page to convert\n",
        " -l <int> : last page to convert\n",
        " -d <dir> : target directory (default: basename of pdf-file)\n",
        " -o <file> : name of output file; - means stdout (default index.html)\n",
        " -q : don't print any messages or errors\n",
        " -h : print this usage information\n",
        " -p : exchange .pdf links by .html\n",
        # these options now have no effect in gs-custom pdftohtml
        # " -c : generate complex HTML document\n",
        # " -F : don't use frames in HTML document\n",
        " -i : ignore images\n",
        " -e <string> : set extension for images (in the Html-file) (default png)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
64
# main(@args) -- run the pdftohtml utility on a PDF file and tidy its output.
#
# Arguments (normally the script's @ARGV):
#   [options] <PDF-file> <html-file>
# The options are parsed so that existing command lines keep working, but
# none of their values is passed on to pdftohtml by this wrapper.
#
# Steps:
#   1. run pdftohtml -noframes, sending stdout to <stem>.out and (except on
#      windows) stderr to <stem>.err;
#   2. rewrite <stem>.html, removing </b><b> and </i><i> pairs and doubling
#      every backslash;
#   3. convert every image listed in images.log (or image.log) in the
#      current directory to PNG, deleting each original once converted.
#
# Returns 1 on success and 0 if the pdftohtml command fails.
# Calls print_usage() (which exits) on bad arguments, exits with status 1 if
# the input file is unreadable, and dies if the HTML file cannot be
# post-processed.
sub main {
    my (@args) = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
#                        'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                        ))
    {
        print_usage();
    }

    # Exactly two arguments must remain: the PDF file and the HTML file.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # One platform test, used consistently for every windows-specific
    # decision below.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) ? 1 : 0;

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if $on_windows;

    # NOTE(review): nothing in this file ever assigns $timeout, so this
    # branch only fires if something else sets $main::timeout -- confirm
    # whether it is still wanted.
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout; $cmd";}
    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $on_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # NOTE(review): stdout was redirected to "$output_filestem.out", yet
        # it is the ".text" file that gets cleaned up here -- confirm intent.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";
    util::mv($html_file, $tmp_file);

    # `or` (not `||`) so that the die applies to open() itself rather than
    # to the filename string, which is always true.
    open(my $html_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $html_out, '>', $html_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$html_in>)) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g;
        print $html_out $line;
    }
    close $html_in;
    # buffered write errors only surface at close
    close $html_out
        or die "Couldn't write file $html_file: $!";
    util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;
    my $image_log;
    if (open($image_log, '<', "images.log") || open($image_log, '<', "image.log")) {
        @images = <$image_log>;
        close $image_log;
    } else {
        # No log could be opened: report it and carry on with no images,
        # without reading from an unopened handle.
        print STDERR "Error opening image log:$!\n";
    }

    for my $image (@images) {
        chomp($image);
        next unless length $image;      # ignore blank lines in the log

        my $convert_cmd;
        if ($on_windows) {
            $convert_cmd = "pnmtopng \"$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 0; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            # Strip only the final extension, so a dot elsewhere in the
            # path (e.g. "./dir/img.ppm") does not truncate the name.
            (my $image_base = $image) =~ s{\.[^./\\]*\z}{};

            # Quote the paths so filenames containing spaces survive the shell.
            $convert_cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                $convert_cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 0; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # The original image is only removed once a conversion succeeded.
        util::rm($image);
    }

    return 1;
}
197
# indicate our error status: a true result from main becomes exit status 0,
# a false one becomes exit status 1
exit(main(@ARGV) ? 0 : 1);
Note: See TracBrowser for help on using the repository browser.