source: trunk/gsdl/bin/script/pdftohtml.pl@ 2599

Last change on this file since 2599 was 2599, checked in by jrm21, 23 years ago

we now do some post-processing to fix up words broken by html tags - eg
<b>Wo</b><b>rds</b> would go to mg as "Wo" and "rds", not "Words". It also
makes the html cleaner. We currently only do this for <b> and <i> tags.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # Refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the
    # front of the module search path before any "use" line is compiled.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage() -- write the command-line summary to STDERR and exit.
#
# Takes no arguments and never returns: the process exits with status 1.
# Note that most of the options listed are not actually used by this script.
sub print_usage {
    my @usage_text = (
        "pdftohtml version 0.22 - modified for NZDL use\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        " -f <int> : first page to convert\n",
        " -l <int> : last page to convert\n",
        " -d <dir> : target directory (default: basename of pdf-file)\n",
        " -o <file> : name of output file; - means stdout (default index.html)\n",
        " -q : don't print any messages or errors\n",
        " -h : print this usage information\n",
        " -p : exchange .pdf links by .html\n",
        # these options now have no effect in gs-custom pdftohtml
        # " -c : generate complex HTML document\n",
        # " -F : don't use frames in HTML document\n",
        " -i : ignore images\n",
        " -e <string> : set extension for images (in the Html-file) (default png)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
64
# main(@args) -- convert one PDF file to HTML with the pdftohtml utility.
#
# @args holds the command-line words: [options] <PDF-file> <html-file>.
# Steps:
#   1. run pdftohtml (-noframes), sending its stdout to <stem>.out and,
#      except on windows, its stderr to <stem>.err;
#   2. rewrite the generated HTML, deleting "</b><b>" and "</i><i>" pairs
#      so that words are not split by redundant tags;
#   3. convert every image listed in images.log (or image.log) in the
#      current directory to PNG and remove the original image.
#
# Returns 1 on success and 0 if the pdftohtml command fails.  Bad arguments
# are handed to print_usage(), which exits; an unreadable input file exits
# with status 1; failing to open or write the HTML files dies.
sub main {
    my @args = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script
    # (the option values themselves are not used further down)
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
#                        'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                        ))
    {
        print_usage();
    }

    # We need exactly two words: the PDF file and the HTML file name.
    if (@args != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # Decide once whether we are on windows.  GSDLOS may be unset, and the
    # test is case-insensitive so that every platform check below agrees.
    my $gsdlos     = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $on_windows = ($gsdlos =~ /^windows/i) ? 1 : 0;

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if $on_windows;

    # Nothing sets a CPU-time limit at present, so this stays undefined and
    # the "ulimit" prefix is not added; it is declared here so the hook no
    # longer relies on an undeclared package variable.
    my $timeout;
    if ($timeout) {$cmd = "ulimit -t $timeout; $cmd";}

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $on_windows;

    # system() returns -1 if it can't run, otherwise it's the command's
    # return value.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # NOTE(review): the command above writes .html/.out/.err, not
        # .text -- confirm which leftovers were meant to be removed here.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";
    util::mv($html_file, $tmp_file);

    # "or" (not "||") so that the die applies to open() itself rather than
    # to the file name string, which is always true.
    open(my $in_fh, "<", $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, ">", $html_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$in_fh>) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        print $out_fh $line;
    }
    close($in_fh);
    # buffered write errors only show up when the handle is closed
    close($out_fh)
        or die "Couldn't write $html_file: $!";
    util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format.
    # The image names are read from a log file in the current directory;
    # if neither log can be opened we report it and convert nothing.
    my @images;
    my $log_fh;
    if (open($log_fh, "<", "images.log") || open($log_fh, "<", "image.log")) {
        @images = <$log_fh>;
        close($log_fh);
    } else {
        print STDERR "Error opening image log:$!\n";
    }

    for my $image (@images) {
        chomp($image);
        next unless length $image;   # ignore blank lines in the log

        my $img_cmd = "";
        if ($on_windows) {
            # NOTE(review): no output file is named here, unlike the branch
            # below -- presumably the windows pnmtopng picks its own.
            $img_cmd = "pnmtopng \"$image\"";
            if (system($img_cmd) != 0) {
                print STDERR "Error executing $img_cmd\n";
                #return 0; # not sure about whether to leave this one in or take it out
                next;
            }
        } else {
            # NOTE(review): the base name is everything before the FIRST
            # dot, so a dot elsewhere in the name or path truncates it.
            my @nameparts  = split(/\./, $image);
            my $image_base = shift(@nameparts);

            # try pnmtopng first and fall back on ImageMagick's convert
            $img_cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
            if (system($img_cmd) != 0) {
                $img_cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
                if (system($img_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 0; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # only reached when the conversion succeeded
        util::rm($image);
    }

    return 1;
}
196
# Turn main()'s result into the process exit status:
# 0 when main() returns true, 1 otherwise.
exit(main(@ARGV) ? 0 : 1);
Note: See TracBrowser for help on using the repository browser.