#!/usr/bin/perl -w

###########################################################################
#
# pdftohtml.pl -- convert documents to HTML or TEXT format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# pdftohtml.pl is a wrapper for running the pdftohtml utility, which converts
# PDF documents to HTML, and converts images to PNG format for display in
# the HTML pages generated.
#
# Invocation: pdftohtml.pl [options] <PDF-file> <output-filestem>
# (exactly two positional arguments are required after option parsing).

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use strict;
use warnings;

use parsargv;
use util;
use Cwd;
use File::Basename;

# Optional CPU-time limit (in seconds) applied to the converter via
# "ulimit -t".  Nothing in this file assigns it, so it is normally unset;
# it is kept as a package variable so that the existing check in main()
# behaves exactly as before.
our $timeout;

# Print the command-line synopsis to STDERR and terminate with exit status 1.
sub print_usage {
    print STDERR
        ("pdftohtml version 0.22\n",
         "Usage: pdftohtml [options] <PDF-file> <output-filestem>\n",
         " -f : first page to convert\n",
         " -l : last page to convert\n",
         " -d : target directory (default: basename of pdf-file)\n",
         " -o : name of output file; - means stdout (default index.html)\n",
         " -q : don't print any messages or errors\n",
         " -h : print this usage information\n",
         " -p : exchange .pdf links by .html\n",
         " -c : generate complex HTML document\n",
         " -F : don't use frames in HTML document\n",
         " -i : ignore images\n",
         " -e : set extension for images (in the Html-file) (default png)\n"
        );
    exit (1);
}

# Quote a single argument for inclusion in a shell command line.
#   $arg        - the raw argument (typically a file name)
#   $is_windows - true when running under a Windows GSDLOS
# On Windows the argument is wrapped in double quotes (as this script has
# always done).  Elsewhere it is wrapped in single quotes with embedded
# single quotes escaped, so characters such as $, ` and " in a file name
# are passed through literally instead of being interpreted by the shell.
sub _shell_quote {
    my ($arg, $is_windows) = @_;

    return "\"$arg\"" if $is_windows;

    (my $escaped = $arg) =~ s/'/'\\''/g;
    return "'$escaped'";
}

# Heuristically decide whether a PDF contains any text.
#   $pdf_path - path of the PDF file to inspect
# Returns 1 if the file should be accepted, 0 if it appears to hold no
# textual data.  Dies if the file cannot be opened.
#
# Heuristical code added by John McPherson to attempt to reject
# PDF's with no text in them.... based entirely on observation. We
# should really read the PDF specifications someday...
sub _pdf_appears_textual {
    my ($pdf_path) = @_;

    # Three-argument open: the file name is never interpreted as a mode.
    open (my $pdf_fh, '<', $pdf_path)
        or die "Error: unable to open $pdf_path for reading\n";

    my $found_text_object    = 0;
    my $num_objects          = 0;
    my $non_text_objects     = 0;   # only reported by the debug print below
    my $unenc_stream_objects = 0;

    # Stop scanning as soon as one convincing text object has been seen.
    while (!$found_text_object && defined(my $line = <$pdf_fh>)) {
        $line =~ s/\r/\n/g;

        # Anything that is not the start of a new object (header? footer?)
        # is ignored.
        next unless $line =~ /^\d+ \d+ obj/ms;

        # start of new object: gather lines up to and including "endobj"
        $num_objects++;
        my $object = "";
        while (!eof($pdf_fh) && $line !~ /(>>\s*)?endobj/) {
            $object .= $line;
            $line = <$pdf_fh>;
        }
        # we've hit end of file in a funny place.
        $line = "" unless defined $line;

        # we've got to the end of the current PDF object.
        $object .= $line;

        # remove newline chars, to help our pattern matching for whitespace
        $object =~ tr/\n/ /;

        # determine object type...

        # for PDFWriter, and pdflatex and distill. Eg:
        # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
        # Ie this looks like compressed text....
        if ($object =~ /\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
            $found_text_object = 1;
        }
        # For pdflatex or ps2pdf from dvi->ps:
        # if we are setting a font, then following object is probably text
        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
        elsif ($object =~ /obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
            $found_text_object = 1;
        }
        # Unencoded streams. Eg
        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
        elsif ($object =~ /<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) {
            $unenc_stream_objects++;
        }
        # (some) non-text objects
        elsif ($object =~ /<<.*\/(Type).*>>/s) {
            $non_text_objects++;
        }
    }
    close $pdf_fh;

    # print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";

    # decide whether to accept or reject...
    # some of these numbers are completely arbitrary based on a few .pdfs.
    if ($found_text_object > 0
        || ($num_objects <= 1500 && $unenc_stream_objects > 5)) {
        return 1;
    }
    return 0;
}

# Convert every image named in "images.log" (read from the current working
# directory, one file name per line) to PNG and delete the original.
#   $is_windows - true when running under a Windows GSDLOS
# Returns 1 on success, 0 as soon as one image cannot be converted.
# A missing or unreadable images.log is treated as "no images".
sub _convert_images_to_png {
    my ($is_windows) = @_;

    my @images;
    if (open (my $log_fh, '<', "images.log")) {
        @images = <$log_fh>;
        close $log_fh;
    }
    chomp(@images);

    for my $image (@images) {
        # A blank line names no file; there is nothing to convert or remove.
        next if $image eq "";

        if ($is_windows) {
            my $cmd = "pnmtopng $image";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                return 0; # not sure about whether to leave this one in or take it out
            }
        }
        else {
            # Strip only the final extension.  Taking everything before the
            # first "." would map "a.b-1.ppm" to "a.png" and "./x.ppm" to
            # ".png".
            (my $image_base = $image) =~ s/\.[^.\/\\]*$//;

            my $source = _shell_quote($image, 0);
            my $target = _shell_quote("$image_base.png", 0);

            # Try pnmtopng first and fall back to ImageMagick's convert.
            my $cmd = "pnmtopng $source > $target 2>/dev/null";
            if (system($cmd) != 0) {
                $cmd = "convert $source $target 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    return 0; # not sure about whether to leave this one in or take it out
                }
            }
        }
        &util::rm($image);
    }

    return 1;
}

# Entry point.
#   @_ - the command-line arguments: options followed by the input PDF
#        file name and the output file stem.
# Returns 1 on success and 0 if the converter or an image conversion
# failed.  Exits with status 1 on bad usage, an unreadable input file, or
# a PDF that appears to contain no text.
sub main {
    my @args = @_;

    # These options are parsed so that the same command line as the real
    # pdftohtml is accepted, but none of them is read again in this
    # script: the converter is always run with "-noframes" only.
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optc, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                        )) {
        print_usage();
    }

    # Exactly two positional arguments must remain.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my $input_filename  = $args[0];
    my $output_filestem = $args[1];

    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    if (!_pdf_appears_textual($input_filename)) {
        # reject this .pdf.
        print STDERR "pdftohtml.pl: $input_filename appears to have no ";
        print STDERR "textual data. Aborting.\n";
        exit(1);
    }
    # accept this .pdf.

    # One platform test, used consistently for every decision below.
    my $gsdlos     = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    my $is_windows = ($gsdlos =~ /^windows/i) ? 1 : 0;

    # formulate the command
    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly - note also that on windows
    # we use pdftohtml.exe not pdftohtml.bin
    my $cmd = $is_windows
        ? "pdftohtml"
        : &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                              $ENV{'GSDLOS'}, "pdftohtml.bin");

    if ($timeout) { $cmd = "ulimit -t $timeout; $cmd"; }

    $cmd .= " -noframes " . _shell_quote($input_filename, $is_windows)
          . " " . _shell_quote("$output_filestem.html", $is_windows);
    $cmd .= " > " . _shell_quote("$output_filestem.out", $is_windows);

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> " . _shell_quote("$output_filestem.err", $is_windows)
        unless $is_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # NOTE(review): the converter writes .html/.out/.err, yet the
        # cleanup targets .text and .err - confirm whether .html was meant.
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # Need to convert images from PPM format to PNG format
    return _convert_images_to_png($is_windows);
}

# The return value of main() is not turned into an exit status, so the
# script exits 0 unless one of the explicit exit(1) paths is taken.
main(@ARGV);