#!/usr/bin/perl -w

###########################################################################
#
# pdftohtml.pl -- convert documents to HTML or TEXT format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
# PDF documents to HTML, and converts images to PNG format for display in
# the HTML pages generated

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use strict;

use parsargv;
use util;
use Cwd;
use File::Basename;

# Print the command-line synopsis to STDERR and exit with status 1.
# Never returns.
sub print_usage {
    # note - we don't actually ever use most of these options...
    print STDERR
        ("pdftohtml version 0.22 - modified for NZDL use\n",
         "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
         " -f : first page to convert\n",
         " -l : last page to convert\n",
         " -d : target directory (default: basename of pdf-file)\n",
         " -o : name of output file; - means stdout (default index.html)\n",
         " -q : don't print any messages or errors\n",
         " -h : print this usage information\n",
         " -p : exchange .pdf links by .html\n",
         # these options now have no effect in gs-custom pdftohtml
         # " -c : generate complex HTML document\n",
         # " -F : don't use frames in HTML document\n",
         " -i : ignore images\n",
         " -e : set extension for images (in the Html-file) (default png)\n"
        );
    exit (1);
}

# Rewrite $html_file in place, deleting adjacent close/open tag pairs that
# pdftohtml emits in the middle of words (they break up words, screwing up
# indexing and searching).
#
# Dies if the temporary copy cannot be read or the output cannot be written.
sub _rejoin_split_words {
    my ($html_file) = @_;

    my $tmp_file = "$html_file.tmp";
    &util::mv($html_file, $tmp_file);

    # Low-precedence "or" so that die is tied to open(), not to the filename.
    open(my $in_fh, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out_fh, '>', $html_file)
        or die "Couldn't open file for writing: $!";

    # NOTE(review): the two tag pairs below were restored after the tag
    # text had been lost from this copy of the script -- confirm against
    # the upstream Greenstone version.
    while (my $line = <$in_fh>) {
        $line =~ s#</b><b>##g;
        $line =~ s#</i><i>##g;
        print {$out_fh} $line;
    }

    close($in_fh);
    # Buffered write errors only surface when the handle is closed.
    close($out_fh)
        or die "Couldn't open file for writing: $!";

    &util::rm($tmp_file);
    return;
}

# Return the image file names listed in "images.log" (or, failing that,
# "image.log") in the current directory, one per line, newline-stripped.
# Reports an error on STDERR and returns an empty list if neither log can
# be opened.
sub _read_image_log {
    my $log_fh;
    if (!open($log_fh, '<', "images.log")
        && !open($log_fh, '<', "image.log")) {
        print STDERR "Error opening image log:$!\n";
        return ();
    }

    my @images;
    while (my $entry = <$log_fh>) {
        chomp($entry);
        next unless length($entry);   # ignore blank lines in the log
        push(@images, $entry);
    }
    close($log_fh);

    return @images;
}

# Convert one PPM image to PNG.  Returns 1 on success, 0 on failure (after
# reporting the problem on STDERR).
sub _convert_image_to_png {
    my ($image, $on_windows) = @_;

    if ($on_windows) {
        # No output redirection here, unlike the non-Windows branch.
        # NOTE(review): presumably the Windows pnmtopng build writes the
        # .png file itself -- confirm.
        my $cmd = "pnmtopng \"$image\"";
        return 1 if (system($cmd) == 0);
        print STDERR "Error executing $cmd\n";
        return 0;
    }

    # NOTE(review): the base name is everything before the FIRST dot, so a
    # name or path containing extra dots yields a truncated .png name.
    # Kept as-is because the generated HTML may rely on this naming.
    my ($image_base) = split(/\./, $image);
    $image_base = "" unless defined $image_base;

    # Try netpbm first, then fall back to ImageMagick.
    my $cmd = "pnmtopng \"$image\" > \"$image_base.png\" 2>/dev/null";
    return 1 if (system($cmd) == 0);

    $cmd = "convert \"$image\" \"$image_base.png\" 2>/dev/null";
    return 1 if (system($cmd) == 0);

    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    return 0;
}

# Run pdftohtml on a PDF file, tidy the generated HTML and convert the
# extracted images to PNG.
#
# Arguments: the command-line argument list (options, then the input PDF
#            file name and the output file stem, with or without ".html").
# Returns:   1 on success, 0 if pdftohtml could not be run successfully.
#            Exits via print_usage() on bad arguments and with status 1 if
#            the input file is unreadable.
sub main {
    my (@args) = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         # 'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                        ))
    {
        print_usage();
    }

    # Exactly two positional arguments must remain after option parsing.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Heuristical code removed due to pdftohtml being "fixed" to not
    # create bitmaps for each char in some pdfs. However, this means we
    # now create .html files even if we can't extract any text. We should
    # check for that now instead someday...

    # Work out the platform once so every check below agrees.
    my $gsdlos     = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $on_windows = ($gsdlos =~ /^windows/i) ? 1 : 0;

    # formulate the command
    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    my $cmd = $on_windows
        ? "pdftohtml"
        : &util::filename_cat($ENV{'GSDLHOME'}, "bin", $gsdlos, "pdftohtml");

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $on_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    my $status = system($cmd);
    if ($status != 0) {
        # $! is only meaningful when the command could not be started.
        my $reason = ($status == -1) ? "$!" : "exit status " . ($status >> 8);
        print STDERR "Error executing $cmd: $reason\n";
        # NOTE(review): stdout was redirected to "$output_filestem.out",
        # not ".text"; the ".text" cleanup is kept as found -- confirm
        # whether ".out" should be removed here as well.
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # post-process to remove </b><b> and </i><i>, as these break up
    # words, screwing up indexing and searching.
    _rejoin_split_words("$output_filestem.html");

    # Need to convert images from PPM format to PNG format.  The source
    # image is only deleted once its conversion has succeeded.
    for my $image (_read_image_log()) {
        next unless _convert_image_to_png($image, $on_windows);
        &util::rm($image);
    }

    return 1;
}

# indicate our error status
exit(main(@ARGV) ? 0 : 1);