[1928] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 |
|
---|
| 4 | ###########################################################################
|
---|
| 5 | #
|
---|
[2367] | 6 | # pdftohtml.pl -- convert PDF documents to HTML format
|
---|
[1928] | 7 | #
|
---|
| 8 | # A component of the Greenstone digital library software
|
---|
| 9 | # from the New Zealand Digital Library Project at the
|
---|
| 10 | # University of Waikato, New Zealand.
|
---|
| 11 | #
|
---|
| 12 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 13 | #
|
---|
| 14 | # This program is free software; you can redistribute it and/or modify
|
---|
| 15 | # it under the terms of the GNU General Public License as published by
|
---|
| 16 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 17 | # (at your option) any later version.
|
---|
| 18 | #
|
---|
| 19 | # This program is distributed in the hope that it will be useful,
|
---|
| 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 22 | # GNU General Public License for more details.
|
---|
| 23 | #
|
---|
| 24 | # You should have received a copy of the GNU General Public License
|
---|
| 25 | # along with this program; if not, write to the Free Software
|
---|
| 26 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 27 | #
|
---|
| 28 | ###########################################################################
|
---|
| 29 |
|
---|
| 30 | # pdftohtml.pl is a wrapper for running pdftohtml utility which converts
|
---|
| 31 | # PDF documents to HTML, and converts images to PNG format for display in
|
---|
| 32 | # the HTML pages generated
|
---|
| 33 |
|
---|
BEGIN {
    # Runs at compile time, before the `use` statements that follow are
    # processed: refuse to start without GSDLHOME, then put
    # $GSDLHOME/perllib at the front of the module search path.
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$gsdl_home/perllib");
}
|
---|
| 38 |
|
---|
| 39 | use parsargv;
|
---|
| 40 | use util;
|
---|
| 41 | use Cwd;
|
---|
| 42 | use File::Basename;
|
---|
| 43 |
|
---|
# print_usage() -- write the usage summary to STDERR and terminate the
# script with exit status 1.  Never returns.
sub print_usage {
    # note - most of the options listed here are parsed by main() but
    # never actually used.
    #
    # The following options now have no effect in the gs-custom pdftohtml
    # and are therefore left out of the message:
    #   -c : generate complex HTML document
    #   -F : don't use frames in HTML document
    print STDERR <<'END_USAGE';
pdftohtml version 0.22 - modified for NZDL use
Usage: pdftohtml [options] <PDF-file> <html-file>
 -f <int> : first page to convert
 -l <int> : last page to convert
 -d <dir> : target directory (default: basename of pdf-file)
 -o <file> : name of output file; - means stdout (default index.html)
 -q : don't print any messages or errors
 -h : print this usage information
 -p : exchange .pdf links by .html
 -i : ignore images
 -e <string> : set extension for images (in the Html-file) (default png)
END_USAGE
    exit(1);
}
|
---|
| 64 |
|
---|
# main(@args) -- run the pdftohtml utility on one PDF file, tidy the
# generated HTML and convert the extracted images to PNG.
#
# Arguments:
#   @args - the command-line arguments: options (see print_usage)
#           followed by exactly two names, <PDF-file> and <html-file>.
#
# Returns 1 on success and 0 if the pdftohtml command itself failed.
# Calls print_usage() (which exits with status 1) on bad arguments or -h,
# exits with status 1 if the input file is not readable, and dies if the
# generated HTML cannot be re-opened for post-processing.
sub main {
    my @args = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script.
    # Apart from -h, the parsed option values are currently not used;
    # parsing them still strips them from @args.
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         # 'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
        ))
    {
        print_usage();
    }

    # -h is documented as "print this usage information"
    print_usage() if $opth;

    # exactly two positional arguments must remain
    print_usage() if scalar(@args) != 2;

    my ($input_filename, $output_filestem) = @args;
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # Decide once whether we are on windows, so that every test below
    # agrees (and so an unset GSDLOS does not trigger warnings).
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $is_windows = ($gsdlos =~ /^windows/i) ? 1 : 0;

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $cmd = "pdftohtml" if $is_windows;

    # NOTE(review): nothing in this file ever assigns $timeout, so the
    # CPU-time limit below is never applied unless the package variable
    # is set from elsewhere.
    our $timeout;
    if ($timeout) { $cmd = "ulimit -t $timeout; $cmd"; }

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $is_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # NOTE(review): the command writes "$output_filestem.out", not
        # ".text" -- confirm which leftover file is meant to be removed.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";
    util::mv($html_file, $tmp_file);

    # `or` (not `||`) so that the die applies to open() itself rather
    # than to the filename argument.
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$in_fh>) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        print $out_fh $line;
    }
    close($in_fh);
    # buffered write errors only show up when the handle is closed
    close($out_fh)
        or die "Couldn't open file for writing: $!";
    util::rm($tmp_file);

    # Need to convert images from PPM format to PNG format.
    # The list of image files is read from "images.log" (or "image.log")
    # in the current directory, one name per line.
    my @images;
    my $log_fh;
    if (open($log_fh, '<', "images.log") || open($log_fh, '<', "image.log")) {
        @images = <$log_fh>;
        close($log_fh);
    }
    else {
        # not fatal: there are simply no images to convert
        print STDERR "Error opening image log:$!\n";
    }
    chomp(@images);

    for my $image (@images) {
        next if $image eq "";   # ignore blank lines in the log

        if ($is_windows) {
            # NOTE(review): no output redirection here -- presumably the
            # windows pnmtopng writes the .png file itself; confirm.
            my $convert_cmd = "pnmtopng \"$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                #return 0; # not sure about whether to leave this one in or take it out
                next;
            }
        }
        else {
            # Strip only the final extension of the file name, so that
            # dots in directory names or in the stem are left alone.
            (my $image_base = $image) =~ s/\.[^.\/\\]*$//;

            # try pnmtopng first and fall back to ImageMagick's convert
            my $convert_cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
            if (system($convert_cmd) != 0) {
                $convert_cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
                if (system($convert_cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    #return 0; # not sure about whether to leave this one in or take it out
                    next;
                }
            }
        }

        # the original image is only removed once it has been converted
        util::rm($image);
    }

    return 1;
}
|
---|
| 196 |
|
---|
# indicate our error status: exit code 0 when main() returns true,
# exit code 1 otherwise
exit(main(@ARGV) ? 0 : 1);
|
---|