source: main/trunk/greenstone2/bin/script/pdftohtml.pl@ 31751

Last change on this file since 31751 was 27757, checked in by ak19, 11 years ago

Using FileUtils subroutines instead of deprecated calls to util package

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
RevLine 
[1928]1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
[2715]6# pdftohtml.pl -- convert PDF documents to HTML format
[1928]7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
[2715]12# Copyright (C) 2001 New Zealand Digital Library Project
[1928]13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # Runs at compile time, ahead of the `use` statements that follow.
    # Refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the
    # front of the module search path.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
38
39use parsargv;
40use util;
[27757]41use FileUtils;
[1928]42use Cwd;
43use File::Basename;
44
# print_usage() -- write the command-line synopsis of this wrapper to STDERR
# and terminate the script with exit status 1.  This sub never returns.
sub print_usage {
    # One entry per output line; the trailing newline is added when printing.
    my @usage_lines = (
        "pdftohtml.pl wrapper for pdftohtml.",
        "Usage: pdftohtml [options] <PDF-file> <html-file>",
        "Options:",
        "\t-i\tignore images (don't extract)",
        "\t-a\tallow images only (continue even if no text is present)",
        "\t-c\tproduce complex output (requires ghostscript)",
        "\t-hidden\tExtract hidden text",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line . "\n";
    }

    exit(1);
}
59
# main(@argv) -- convert one PDF document to HTML.
#
# Runs the external pdftohtml utility, post-processes the generated HTML
# (joins split bold/italic runs, doubles backslashes, makes links to the
# document itself relative, escapes underscores outside tags), checks that
# some text was extracted, and converts any images listed in the image log
# to PNG.
#
# Arguments: the command line: [options] <PDF-file> <html-file>
#            (see print_usage() for the options).
# Returns:   0 on success, 1 on failure; the caller uses it as exit status.
# Dies:      if the output directory is missing or not writable, or if the
#            intermediate HTML files cannot be opened.
# Exits:     via print_usage() when the arguments cannot be parsed or the
#            wrong number of file names is given.
sub main {
    my (@args) = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                        ))
    {
        print_usage();
    }

    # Exactly two file names must remain once the options are consumed.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directories exist to create the output file, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # File name of the input without its directory (either slash style is
    # accepted) and without a trailing ".pdf"; used below to recognise
    # links that point back into the generated document.
    my @path_parts = split(/(\/|\\)/, $input_filename);
    my $input_basename = pop(@path_parts);
    $input_basename =~ s/\.pdf$//i;

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text; that case
    # is detected while post-processing the output below.

    # formulate the command
    my $cmd = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);

    $cmd .= " -i" if ($ignore_images);
    $cmd .= " -c" if ($complex);
    $cmd .= " -hidden" if ($hidden);
    $cmd .= " -zoom $zoom";
    # NOTE(review): the file names are only wrapped in double quotes and the
    # string is run through the shell, so names containing `"`, `$` or
    # backticks are not safe here.
    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";

    # system() returns -1 if it can't run, otherwise it's $cmd's ret val.
    # note we return 0 if the file is "encrypted"
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # Any "$output_filestem.text" / "$output_filestem.err" files are
        # deliberately left in place for gsConvert.pl.
        return 1;
    }

    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$output_filestem.html.tmp";

    if (! -e $html_file) {
        return 1;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    # At the same time, check that our .html file has some textual content.
    FileUtils::moveFiles($html_file, $tmp_file);

    # `or` (not `||`) so that the die applies to open() itself rather than
    # to the always-true file name string.
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    # With -a the text check is skipped by treating text as already seen.
    my $seen_textual_content = $allow_no_text ? 1 : 0;

    # Link prefix that points back into the generated document itself; the
    # base name is quoted so that it is matched literally.
    my $own_href_re = qr/href="\Q$input_basename\E\.html#/;

    my $is_first_line = 1;
    while (defined(my $line = <$in_fh>)) {
        if ($is_first_line) {
            # check for unicode byte-order marker at the start of the file
            $line =~ s#\376\377##g;
            $is_first_line = 0;
        }

        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if (!$seen_textual_content) {
            # Whatever is left after dropping tags, one "Page N" marker
            # and all whitespace counts as extracted text.
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m{<title>(.*?)</title>}i) {
                my $title = $1;

                # is this title the name of a file?
                if (-r "$title.pdf" || -r "$title.html") {
                    # blank the title and keep the name as metadata instead
                    $line =~ s{<title>.*?</title>}{<title></title>\n<META NAME="filename" CONTENT="$title">}i;
                }
            }
        }

        # relative hrefs to own document...
        $line =~ s/$own_href_re/href="#/g;

        # escape underscores, but not if they're inside tags (eg img/href
        # names).  The in-tag state is tracked per line only, so a tag that
        # is split over several lines is not followed.
        if ($line =~ /_/) {
            my $inatag = 0;
            # limit -1 keeps a trailing empty field, so an underscore at
            # the very end of the line is not lost
            my @parts = split(/_/, $line, -1);
            my $lastpart = pop @parts;
            foreach my $part (@parts) {
                if ($part =~ /<[^>]*$/) {        # if we're starting a tag...
                    $inatag = 1;
                } elsif ($part =~ />[^<]*$/) {   # closing a tag
                    $inatag = 0;
                }
                $part .= $inatag ? '_' : "&#95;";
            }
            $line = join('', @parts, $lastpart);
        }

        print $out_fh $line;
    }
    close($in_fh);
    # Buffered write errors only show up when the handle is closed.
    if (!close($out_fh)) {
        print STDERR "Error: unable to finish writing $html_file: $!\n";
        return 1;
    }
    FileUtils::removeFiles($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $images_fh;
    if (open($images_fh, '<', "${directory}images.log") ||
        open($images_fh, '<', "${directory}image.log")) {
        @images = <$images_fh>;
        close($images_fh);
        FileUtils::removeFiles("${directory}image.log") if (-e "${directory}image.log");
    }
    chomp(@images);

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        for my $image (@images) {
            FileUtils::removeFiles("${directory}$image");
        }
        return 1;
    }

    for my $image (@images) {
        my $convert_cmd = "";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            $convert_cmd = "pnmtopng \"${directory}$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                # a failed image is skipped rather than failing the document
                next;
            }
        } else {
            # the PNG is named after the part of the image name before the
            # first dot
            my @nameparts = split(/\./, $image);
            my $image_base = shift(@nameparts);
            $convert_cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                # fall back to gs-magick.pl convert
                $convert_cmd = "\"" . util::get_perl_exec() . "\" -S gs-magick.pl convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    # a failed image is skipped rather than failing the document
                    next;
                }
            }
        }
        # The source image is addressed relative to $directory, exactly as
        # it is in the conversion commands above.
        FileUtils::removeFiles("${directory}$image");
    }

    return 0;
}
265
# Run the conversion on the command-line arguments and use main()'s result
# as the exit status of the script (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
268
Note: See TracBrowser for help on using the repository browser.