###########################################################################
#
# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFBoxConverter;
use BaseMediaConverter;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
use gsprintf 'gsprintf';
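# These two variables must not be given initial values on their declaration
# lines: the BEGIN block below sets them at compile time, and an initialiser
# here would run afterwards (at run time) and overwrite what BEGIN detected.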
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;
# Compile-time probe, run once when this module is loaded: decides whether
# PDFBox conversion can be offered at all.
#
# Sets @PDFBoxConverter::ISA, then sets $pdfbox_conversion_available to 1 or 0.
# On failure $no_pdfbox_conversion_reason is set to one of:
#   gextpdfboxnotinstalled    - the GEXT_PDFBOX environment variable is not set
#   gextpdfboxjarnotinstalled - lib/java/pdfbox-app.jar is missing under GEXT_PDFBOX
#   couldnotrunjava           - the java launcher could not be run successfully
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Assume conversion is possible until one of the checks below fails.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    if (!defined $ENV{'GEXT_PDFBOX'}) {
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    else {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        if (!-e $pbajar) {
            print STDERR "Failed to find $pbajar\n";
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
        else {
            # Test to see if java is in the path. Probe with "java -version"
            # rather than a bare "java": with no arguments the launcher only
            # prints its usage text, and newer JDK launchers then exit with a
            # non-zero status, which the check below would misreport as java
            # being unavailable.
            my $cmd = "java -version";

            # Silence the probe. The order matters on Unix: ">/dev/null 2>&1"
            # discards both streams, whereas "2>&1 >/dev/null" still lets
            # output reach the screen.
            if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) {
                $cmd .= " >nul 2>&1";
            }
            else {
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);
            if ($status != 0) {
                print STDERR "Testing for java\n";
                print STDERR "Failed to run: $cmd\n";
                if ($status == -1) {
                    # Only when system() itself failed does $! describe the error.
                    print STDERR "$!\n";
                }
                else {
                    print STDERR "Exit status: " . ($status >> 8) . "\n";
                }
                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
    }
}
# Plugin self-description. new() appends the contents of $arguments to the
# caller's "ArgList" and pushes $options onto the caller's "OptList".
# This converter declares no arguments of its own, so the list is empty.
my $arguments = [];

# NOTE(review): the brace-delimited 'desc' value looks like a resource key
# that is resolved elsewhere rather than literal text - confirm.
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref; this module's $arguments are appended to its
#                      "ArgList" and its $options to its "OptList"
#   $auxilary        - passed through unchanged to the BaseMediaConverter constructor
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
# Unless the object is 'info_only', it also records:
#   pdfbox_conversion_available - the result of the compile-time probe
#   pdfbox_launch_cmd           - the java command line (only when available)
#   no_pdfbox_conversion_reason - the failure reason (only when unavailable),
#                                 which is also reported on the object's outhandle
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit arrow call instead of the indirect-object form
    # "new BaseMediaConverter(...)", whose parse depends on which subs are
    # already known to the compiler.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        # The line.separator value is a single newline, written here as \n;
        # the resulting string is unchanged from the version that spanned two
        # source lines.
        # NOTE(review): a newline inside a quoted shell argument is fragile
        # (particularly on Windows) and may be an artefact of how this file
        # was extracted - confirm the intended separator against upstream.
        my $launch_cmd = "java -cp \"$pbajar\" -Dline.separator=\"\n\" org.apache.pdfbox.ExtractText";
        $self->{'pdfbox_launch_cmd'} = $launch_cmd;
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
# Reset the per-run list of temporary file paths.
#
# Parameters:
#   $verbosity, $outhandle, $failhandle - accepted for interface
#       compatibility; not used by this method.
#
# Returns nothing.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Store a genuine empty array ref. The previous "= ()" was a scalar
    # assignment of the empty list, which stores undef rather than an empty
    # container.
    # NOTE(review): assumes the field is only ever used as an array ref -
    # confirm against the code that records and cleans up these paths.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
# Tear-down hook: delegates to the object's clean_up_temporary_files()
# method and hands back whatever that call returns.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
sub convert {
my $self = shift(@_);
my ($source_file_full_path, $target_file_type) = @_;
return 0 unless $pdfbox_conversion_available;
# check the filename
return 0 if ( !-f $source_file_full_path);
# the following line is necessary to avoid 'uninitialised variable' error
# messages concerning the converted_to member variable when PDFPlugin's
# use_sections option is checked.
# PDFBox plugin now processes use_sections option, when working with v1.5.0
# of the PDFBox jar file (which embeds each page in special