#!/usr/bin/perl -w


###########################################################################
#
# pdftohtml.pl -- convert documents to HTML or TEXT format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# pdftohtml.pl is a wrapper for running the pdftohtml utility, which converts
# PDF documents to HTML, and for converting the extracted images to PNG
# format for display in the HTML pages generated.

BEGIN {
    # The Greenstone perl libraries live under $GSDLHOME/perllib; without
    # GSDLHOME there is no way to locate them, so give up at compile time.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # Put the Greenstone library directory at the front of the search path
    # so that the "use" statements that follow can find its modules.
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}

use parsargv;
use util;
use Cwd;
use File::Basename;

# Print the command-line usage summary to STDERR and terminate the script
# with exit status 1. Never returns to the caller.
sub print_usage {
    # Non-interpolating heredoc: the text below is emitted verbatim.
    print STDERR <<'END_OF_USAGE';
pdftohtml version 0.22
Usage: pdftohtml [options] <PDF-file> <html-file>
  -f <int>      : first page to convert
  -l <int>      : last page to convert
  -d <dir>      : target directory (default: basename of pdf-file)
  -o <file>     : name of output file; - means stdout (default index.html)
  -q            : don't print any messages or errors
  -h            : print this usage information
  -p            : exchange .pdf links by .html
  -c            : generate complex HTML document
  -F            : don't use frames in HTML document
  -i            : ignore images
  -e <string>   : set extension for images (in the Html-file) (default png)
END_OF_USAGE

    exit(1);
}

# Heuristically decide whether a PDF file contains any text worth extracting.
#
# Heuristic code added by John McPherson to attempt to reject PDFs with no
# text in them; it is based entirely on observation of a few files rather
# than on the PDF specification.
#
# The file is scanned line by line. Every "N M obj ... endobj" object is
# collected into a single string and classified:
#   * an object whose dictionary starts with /Length followed by /Filter is
#     taken to be compressed text, and settles the question immediately;
#   * an object that sets up a /Font is taken to precede text, and also
#     settles the question immediately;
#   * an unencoded stream that begins a text section (BT) is only counted.
#
# Parameter: $pdf_filename - path of the PDF file to inspect.
# Returns:   1 if the file should be accepted, 0 if it appears to be
#            text-free.
# Dies if the file cannot be opened.
sub _pdf_appears_to_contain_text {
    my ($pdf_filename) = @_;

    # Three-argument open so that the filename is always taken literally.
    open(my $pdf_fh, '<', $pdf_filename)
        or die "Error: unable to open $pdf_filename for reading\n";

    my $found_text_object    = 0;
    my $num_objects          = 0;
    my $unenc_stream_objects = 0;

    while (defined(my $line = <$pdf_fh>)) {
        $line =~ s/\r/\n/g;

        # Anything outside an object (header, trailer, ...) is ignored.
        next unless $line =~ /^\d+ \d+ obj/ms;

        # Start of a new object: gather lines until the one holding "endobj".
        $num_objects++;
        my $object = "";
        while (!eof($pdf_fh) && $line !~ /(>>\s*)?endobj/) {
            $object .= $line;
            $line = <$pdf_fh>;
            if (!defined $line) {
                # we've hit end of file in a funny place.
                $line = "";
                last;
            }
        }
        $object .= $line;

        # remove newline chars, to help our pattern matching for whitespace
        $object =~ s/\n/ /g;

        # For PDFWriter, pdflatex and distill. Eg:
        # "12 0 obj << /Length 13 0 R /Filter /LZWDecode >> stream ..."
        # Ie this looks like compressed text....
        if ($object =~ /\d+\s+\d+\s+obj\s+<<\s+\/Length\s+\d+\s+\d+\s*.\s*\/Filter/) {
            $found_text_object = 1;
        }
        # For pdflatex or ps2pdf from dvi->ps:
        # if we are setting a font, then following object is probably text
        # Eg "obj << /Font" or "obj << /ProcSet [...] /Font"
        elsif ($object =~ /obj\s*<<\s*(\/ProcSet \[.+?\]\s*)?\/Font /s) {
            $found_text_object = 1;
        }
        # Unencoded streams. Eg
        # "<< /Length 45 0 R >> stream BT /R43 8.96638 Tf 1..."
        elsif ($object =~ /<<\s+\/Length\s+\d+\s+\d+\s+R\s+>>\s+stream\s+(q\s)?BT\s/s) {
            $unenc_stream_objects++;
        }

        # One positive identification is enough; stop reading.
        last if $found_text_object;
    }
    close($pdf_fh);

    return 1 if $found_text_object;

    # some of these numbers are completely arbitrary based on a few .pdfs.
    return ($num_objects <= 1500 && $unenc_stream_objects > 5) ? 1 : 0;
}

# Convert every image listed in an image log to PNG and delete the original.
#
# Parameters:
#   $image_log  - file listing one image filename per line.
#   $on_windows - true when running on Windows, where "pnmtopng" is run
#                 without any output redirection.
#                 NOTE(review): presumably the Windows pnmtopng build writes
#                 the .png file itself - confirm.
# Returns 1 on success (including when the log cannot be opened, i.e. there
# is nothing to convert), 0 as soon as one image cannot be converted.
sub _convert_images_to_png {
    my ($image_log, $on_windows) = @_;

    # No readable log means no images were reported; nothing to do.
    open(my $log_fh, '<', $image_log) or return 1;
    my @images = <$log_fh>;
    close($log_fh);
    chomp(@images);

    for my $image (@images) {
        # A blank line would make the converters wait on standard input.
        next unless length $image;

        if ($on_windows) {
            my $cmd = "pnmtopng \"$image\"";
            if (system($cmd) != 0) {
                print STDERR "Error executing $cmd\n";
                return 0; # not sure about whether to leave this one in or take it out
            }
        }
        else {
            # Strip only the final extension, so that dots elsewhere in the
            # path (e.g. "./x.ppm" or "a.b-1.ppm") do not truncate the name.
            (my $image_base = $image) =~ s/\.[^.\/\\]*$//;
            my $png = "$image_base.png";

            # Try pnmtopng first and fall back to ImageMagick's convert.
            my $cmd = "pnmtopng \"$image\" > \"$png\" 2>/dev/null";
            if (system($cmd) != 0) {
                $cmd = "convert \"$image\" \"$png\" 2>/dev/null";
                if (system($cmd) != 0) {
                    print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
                    return 0; # not sure about whether to leave this one in or take it out
                }
            }
        }
        util::rm($image);
    }

    return 1;
}

# Entry point: convert one PDF file to HTML with the pdftohtml binary and
# turn the images it extracted into PNG files.
#
# Arguments (normally @ARGV): [options] <PDF-file> <html-file>
#
# Returns 1 on success, or 0 if the pdftohtml command or an image
# conversion fails. Exits with status 1 (via print_usage or directly) on bad
# arguments, an unreadable input file, or a PDF that appears to have no text.
sub main {
    my (@args) = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optc, $optF, $opti);

    # read command-line arguments so that
    # you can change the command in this script
    # (apart from -h, the parsed values are currently not forwarded to the
    # pdftohtml command built below)
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                         ))
    {
        print_usage();
    }

    # -h is advertised in the usage text, so honour it.
    print_usage() if $opth;

    # Exactly two positional arguments must remain: input and output.
    print_usage() if @args != 2;

    my ($input_filename, $output_filestem) = @args;
    $output_filestem =~ s/\.html$//; # pdftohtml adds this suffix

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Reject PDFs that look as though they contain no text at all.
    if (!_pdf_appears_to_contain_text($input_filename)) {
        print STDERR "pdftohtml.pl: $input_filename appears to have no ";
        print STDERR "textual data. Aborting.\n";
        exit(1);
    }

    # A single platform test, used consistently for every decision below.
    my $on_windows =
        (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) ? 1 : 0;

    # formulate the command
    my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml.bin");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly - note also that on windows
    # we use pdftohtml.exe not pdftohtml.bin
    $cmd = "pdftohtml" if $on_windows;

    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " > \"$output_filestem.out\"";

    # attempting to redirect STDERR on windows 95/98 is a bad idea
    $cmd .= " 2> \"$output_filestem.err\"" unless $on_windows;

    # system() returns -1 if it can't run, otherwise it's $cmds ret val.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # NOTE(review): the command above names .html, .out and .err files
        # but no .text file - confirm whether .out was meant here.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # Need to convert images from PPM format to PNG format; images.log is
    # looked up in the current working directory.
    return _convert_images_to_png("images.log", $on_windows);
}

# Run the conversion on the command-line arguments.
# NOTE(review): main's return value (0 when a conversion step fails) is
# discarded here, so it is not reflected in the script's exit status -
# confirm that callers do not rely on it.
main(@ARGV);