source: main/trunk/greenstone2/bin/script/pdftohtml.pl@ 22642

Last change on this file since 22642 was 7643, checked in by jrm21, 20 years ago

use case-insensitive match for <title> tags

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert PDF documents to HTML format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 2001 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
# Compile-time setup: refuse to run without GSDLHOME, then put
# $GSDLHOME/perllib at the front of the module search path so that the
# project modules loaded by the "use" lines that follow can be located
# (presumably parsargv and util live there -- confirm against the install).
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage -- write the command-line help text to STDERR and terminate
# the script with exit status 1. This sub never returns to its caller.
sub print_usage {
    # Note: most of the options listed here are not actually used by
    # this wrapper.
    my @help_text = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR @help_text;
    exit(1);
}
58
# main(@args) -- convert one PDF file to HTML by running the external
# pdftohtml program, then tidy up the HTML it produced.
#
# Arguments: the command line, i.e. [options] <PDF-file> <html-file>.
#   Options are parsed by parsargv::parse: -a, -i, -c, -hidden, -zoom <n>
#   (the 'zoom/\d+/2' spec presumably means "digits, default 2" -- confirm
#   against parsargv).
#
# Returns: 0 on success, 1 on failure (the caller uses it as exit status).
#
# Does not return when:
#   - the arguments are wrong (print_usage() exits with status 1),
#   - the input file is unreadable (exit 1),
#   - the output directory is not writable, or the intermediate HTML files
#     cannot be opened / written (die).
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a',          \$allow_no_text,
                         'i',          \$ignore_images,
                         'c',          \$complex,
                         'hidden',     \$hidden,
                         'zoom/\d+/2', \$zoom,
                        ))
    {
        print_usage();
    }

    # After option parsing exactly two arguments must remain.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately. (File:: is included by util.pm)
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # Input file name without any leading directories (either slash
    # style) and without its .pdf extension.
    my $input_basename = $input_filename;
    $input_basename =~ s{^.*[/\\]}{}s;
    $input_basename =~ s/\.pdf$//i;

    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -c" if ($complex);
    $cmd .= " -hidden" if ($hidden);
    $cmd .= " -zoom $zoom";
    # NOTE(review): the file names are passed through the shell inside
    # double quotes, so names containing ", $ or ` are not safe here.
    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # leave these for gsConvert.pl...
        #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 1;
    }

    if (! -e "$output_filestem.html") {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    util::mv("$output_filestem.html", "$output_filestem.html.tmp");
    $! = 0;
    # "or" (not "||") so that a failed open really is detected.
    open(my $in_fh, '<', "$output_filestem.html.tmp")
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', "$output_filestem.html")
        or die "Couldn't open file for writing: $!";

    # Normalise the -a flag to 0/1 so the numeric tests below are clean.
    my $seen_textual_content = $allow_no_text ? 1 : 0;

    # Links pdftohtml generated to anchors inside this same document;
    # the basename is quoted so regex metacharacters in it are literal.
    my $self_href_re = qr/href="\Q$input_basename\E\.html\#/;

    # check for unicode byte-order marker at the start of the file
    my $line = <$in_fh>;
    $line =~ s#\376\377##g if (defined $line);

    # Test definedness rather than truth so a final line of "0" is kept.
    while (defined $line) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if ($seen_textual_content == 0) {
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }
            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@i) {
                my $title = $1;

                # is this title the name of a filename?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title
                    $line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
                }
            }
        }

        # relative hrefs to own document...
        $line =~ s/$self_href_re/href="#/g;

        # escape underscores, but not if they're inside tags (eg img/href
        # names). The in-tag state is tracked within a single line only;
        # it is reset for every line read.
        if ($line =~ /_/) {
            my $inatag = 0;
            # Limit -1 keeps trailing empty fields, so underscores at the
            # very end of an unterminated last line are not dropped.
            my @parts = split(/_/, $line, -1);
            my $lastpart = pop @parts;
            foreach my $part (@parts) {
                if ($part =~ /<[^>]*$/) {        # if we're starting a tag...
                    $inatag = 1;
                }
                elsif ($part =~ />[^<]*$/) {     # closing a tag
                    $inatag = 0;
                }
                $part .= $inatag ? '_' : "&#95;";
            }
            $line = join('', @parts, $lastpart);
        }

        print $out_fh $line;
        $line = <$in_fh>;
    }
    close($in_fh);
    # Buffered write errors only surface at close time.
    close($out_fh)
        or die "Couldn't finish writing $output_filestem.html: $!\n";
    util::rm("$output_filestem.html.tmp");

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $log_fh;
    if (open($log_fh, '<', "${directory}images.log")
        || open($log_fh, '<', "${directory}image.log"))
    {
        @images = <$log_fh>;
        close($log_fh);
        chomp(@images);
        util::rm("${directory}image.log") if (-e "${directory}image.log");
    }

    # no need to go any further if there is no text extracted from pdf.
    if ($seen_textual_content == 0) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        for my $image (@images) {
            util::rm("${directory}$image");
        }
        return 1;
    }

    for my $image (@images) {
        my $image_path = "${directory}$image";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            my $convert_cmd = "pnmtopng \"$image_path\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 1; # not sure about whether to leave this one in or take it out
                next;
            }
        }
        else {
            # Output name: everything before the first dot, plus .png
            my @nameparts = split(/\./, $image);
            my $image_base = shift(@nameparts);
            my $convert_cmd = "pnmtopng \"$image_path\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                $convert_cmd = "convert \"$image_path\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 1; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }
        # Remove the converted source image; it lives in $directory, the
        # same location the conversion commands above read it from.
        util::rm($image_path);
    }

    return 0;
}
264
# Run the conversion and hand its result back to the shell as the
# process exit status (0 = success, non-zero = failure).
exit(main(@ARGV));
267
Note: See TracBrowser for help on using the repository browser.