[1928] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 |
|
---|
| 4 | ###########################################################################
|
---|
| 5 | #
|
---|
[2715] | 6 | # pdftohtml.pl -- convert PDF documents to HTML format
|
---|
[1928] | 7 | #
|
---|
| 8 | # A component of the Greenstone digital library software
|
---|
| 9 | # from the New Zealand Digital Library Project at the
|
---|
| 10 | # University of Waikato, New Zealand.
|
---|
| 11 | #
|
---|
[2715] | 12 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
[1928] | 13 | #
|
---|
| 14 | # This program is free software; you can redistribute it and/or modify
|
---|
| 15 | # it under the terms of the GNU General Public License as published by
|
---|
| 16 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 17 | # (at your option) any later version.
|
---|
| 18 | #
|
---|
| 19 | # This program is distributed in the hope that it will be useful,
|
---|
| 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 22 | # GNU General Public License for more details.
|
---|
| 23 | #
|
---|
| 24 | # You should have received a copy of the GNU General Public License
|
---|
| 25 | # along with this program; if not, write to the Free Software
|
---|
| 26 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 27 | #
|
---|
| 28 | ###########################################################################
|
---|
| 29 |
|
---|
| 30 | # pdftohtml.pl is a wrapper for running pdftohtml utility which converts
|
---|
| 31 | # PDF documents to HTML, and converts images to PNG format for display in
|
---|
| 32 | # the HTML pages generated
|
---|
| 33 |
|
---|
# Compile-time setup: refuse to run without GSDLHOME, then put the
# perllib directory under it at the front of the module search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
|
---|
| 38 |
|
---|
| 39 | use parsargv;
|
---|
| 40 | use util;
|
---|
[27757] | 41 | use FileUtils;
|
---|
[1928] | 42 | use Cwd;
|
---|
| 43 | use File::Basename;
|
---|
| 44 |
|
---|
# Write the command-line synopsis to STDERR and terminate the script
# with exit status 1. Takes no arguments and never returns.
sub print_usage {
    my @usage_lines = (
        "pdftohtml.pl wrapper for pdftohtml.\n",
        "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
        "Options:\n",
        "\t-i\tignore images (don't extract)\n",
        "\t-a\tallow images only (continue even if no text is present)\n",
        "\t-c\tproduce complex output (requires ghostscript)\n",
        "\t-hidden\tExtract hidden text\n",
        "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n",
    );

    print STDERR join("", @usage_lines);
    exit(1);
}
|
---|
| 59 |
|
---|
# Run the pdftohtml utility on a PDF file and post-process its output.
#
# Arguments: the command-line argument list, i.e. any of the options
#            described by print_usage() followed by <PDF-file> <html-file>.
# Returns:   0 on success, 1 on failure (the caller uses it as exit status).
#            Bad usage and an unreadable input file exit(1) directly; an
#            unwritable output directory or an intermediate file that
#            cannot be opened causes a die.
sub main {
    my @args = @_;
    my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'a', \$allow_no_text,
                         'i', \$ignore_images,
                         'c', \$complex,
                         'hidden', \$hidden,
                         'zoom/\d+/2', \$zoom,
                         ))
    {
        print_usage();
    }

    # exactly two positional arguments must be left: input and output
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;
    $output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix

    # test that the directory exists to create the output file in, or
    # we should exit immediately.
    my $output_dir = File::Basename::dirname($output_filestem);
    if (! -d $output_dir || ! -w $output_dir) {
        die "pdftohtml.pl: cannot write to directory $output_dir\n";
    }

    # file name without any directory part (either slash style) and
    # without its trailing .pdf extension
    my ($input_basename) = $input_filename =~ m{([^/\\]*)\z};
    $input_basename =~ s/\.pdf$//i;

    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # formulate the command as an argument list so that no shell is
    # involved and file names need no quoting
    my @cmd = (FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml"));

    # don't include the path on windows, but assume that the PATH is
    # set up correctly.
    @cmd = ("pdftohtml") if ($ENV{'GSDLOS'} =~ /^windows$/);

    push(@cmd, "-i")      if ($ignore_images);
    push(@cmd, "-c")      if ($complex);
    push(@cmd, "-hidden") if ($hidden);
    push(@cmd, "-zoom", $zoom) if (defined $zoom && $zoom ne "");
    push(@cmd, "-noframes", "-p", "-enc", "UTF-8",
         $input_filename, "$output_filestem.html");

    # system() returns -1 if it can't run, otherwise it's the command's
    # return value. note we return 0 if the file is "encrypted"
    $! = 0;
    if (system(@cmd) != 0) {
        print STDERR "pdftohtml error for $input_filename $!\n";
        # any $output_filestem.text / .err files are left for gsConvert.pl
        return 1;
    }

    my $html_file = "$output_filestem.html";
    my $tmp_file  = "$html_file.tmp";
    if (! -e $html_file) {
        return 1;
    }

    # post-process the generated page via a temporary copy, noting at the
    # same time whether it has any textual content
    FileUtils::moveFiles($html_file, $tmp_file);
    my $seen_textual_content =
        _postprocess_html($tmp_file, $html_file, $input_basename, $allow_no_text);
    FileUtils::removeFiles($tmp_file);

    # Need to convert images from PPM format to PNG format
    my @images;

    my $directory = $output_filestem;
    $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
    # newer versions of pdftohtml don't seem to do images this way anymore?
    my $images_fh;
    if (open($images_fh, '<', "${directory}images.log") ||
        open($images_fh, '<', "${directory}image.log")) {
        @images = <$images_fh>;
        close($images_fh);
        chomp(@images);
        # a blank entry would otherwise name the directory itself
        @images = grep { $_ ne "" } @images;
        FileUtils::removeFiles("${directory}image.log") if (-e "${directory}image.log");
    }

    # no need to go any further if there is no text extracted from pdf.
    if (!$seen_textual_content) {
        print STDERR "Error: PDF contains no extractable text\n";
        # remove images...
        foreach my $image (@images) {
            FileUtils::removeFiles("${directory}$image");
        }
        return 1;
    }

    foreach my $image (@images) {
        my $image_path = "${directory}$image";
        if ($ENV{'GSDLOS'} =~ /^windows/i) {
            my $cmd = "pnmtopng \"$image_path\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                next; # a failed image is skipped rather than treated as fatal
            }
        } else {
            # output name is everything before the first dot, plus .png
            my ($image_base) = split(/\./, $image);
            my $png_path = "${directory}$image_base.png";
            my $cmd = "pnmtopng \"$image_path\" > \"$png_path\" 2>/dev/null";
            if (system($cmd) != 0) {
                # fall back to the convert wrapper
                $cmd = "\"" . util::get_perl_exec() . "\" -S gs-magick.pl convert \"$image_path\" \"$png_path\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    next; # a failed image is skipped rather than treated as fatal
                }
            }
        }
        # the source image is no longer needed; it lives in the output
        # directory, like every other path used for it above
        FileUtils::removeFiles($image_path);
    }

    return 0;
}

# Copy $tmp_file to $html_file line by line, cleaning up the markup:
# drops a byte-order marker at the start of the file, removes </b><b>
# and </i><i> (they break up words, screwing up indexing and searching),
# doubles backslashes, blanks a <title> that is just a file name, makes
# hrefs to the document itself relative, and escapes underscores.
# Returns 1 if any textual content was seen (or $allow_no_text is true),
# otherwise 0. Dies if either file cannot be opened or the output
# cannot be closed.
sub _postprocess_html {
    my ($tmp_file, $html_file, $input_basename, $allow_no_text) = @_;

    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    my $seen_textual_content = $allow_no_text ? 1 : 0;
    my $at_first_line = 1;

    while (defined(my $line = <$in_fh>)) {
        if ($at_first_line) {
            # check for unicode byte-order marker at the start of the file
            $line =~ s#\376\377##g;
            $at_first_line = 0;
        }

        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...

        # check for any extracted text
        if (!$seen_textual_content) {
            my $tmp_line = $line;
            $tmp_line =~ s/<[^>]*>//g;
            $tmp_line =~ s/Page\s\d+//;
            $tmp_line =~ s/\s*//g;
            if ($tmp_line ne "") {
                $seen_textual_content = 1;
            }

            # special - added to remove the filename from the title
            # this should be in the header, before we see "textual content"
            if ($line =~ m@<title>(.*?)</title>@i) {
                my $title = $1;

                # is this title the name of a file?
                if (-r "$title.pdf" || -r "$title.html") {
                    # remove the title
                    $line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
                }
            }
        }

        # relative hrefs to own document... (the name is quoted so that
        # regex metacharacters in it are matched literally)
        $line =~ s@href=\"\Q$input_basename\E\.html\#@href=\"\#@g;

        $line = _escape_underscores($line) if ($line =~ /_/);

        print $out_fh $line;
    }

    close($in_fh);
    close($out_fh) or die "Couldn't close $html_file: $!";

    return $seen_textual_content;
}

# Escape underscores in $line as &#95;, but not if they're inside tags
# (eg img/href names). Returns the rewritten line.
# NOTE(review): the original had two identical branches that both wrote a
# plain "_"; the entity is restored here on the strength of the comment
# describing the intent - confirm against the consumer of the HTML.
# NOTE(review): tag state is tracked within one line only, so a tag that
# spans lines is not recognised.
sub _escape_underscores {
    my ($line) = @_;

    my $inatag = 0;
    # limit of -1 keeps trailing empty fields, so trailing underscores survive
    my @parts = split(/_/, $line, -1);
    my $lastpart = pop(@parts);

    foreach my $part (@parts) {
        if ($part =~ /<[^>]*$/) {        # if we're starting a tag...
            $inatag = 1;
        } elsif ($part =~ />[^<]*$/) {   # closing a tag
            $inatag = 0;
        }
        $part .= $inatag ? '_' : '&#95;';
    }

    return join('', @parts, $lastpart);
}
|
---|
| 265 |
|
---|
# Run the conversion and hand its result back to the caller as the
# process exit status (0 = success).
my $exit_status = main(@ARGV);
exit($exit_status);
|
---|
| 268 |
|
---|