###########################################################################
#
# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFBoxConverter;
use BaseMediaConverter;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments
#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
use gsprintf 'gsprintf';
use FileUtils;
# Availability flags for PDFBox conversion. They are deliberately declared
# without an initial value here: initialising them at this point would make
# them get stuck at those values. The BEGIN block below assigns them.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time probe: sets up inheritance and decides whether PDFBox
# conversion can be offered, recording a reason key when it cannot.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Start optimistic; each failed check below clears the flag and
    # records the corresponding reason key.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    # Location of the PDFBox extension and of the jar expected inside it.
    my $pdfbox_ext_dir = $ENV{'GEXT_PDFBOX'};
    my $app_jar_path;
    if (defined $pdfbox_ext_dir) {
        $app_jar_path = &FileUtils::filenameConcatenate($pdfbox_ext_dir,"lib","java","pdfbox-app.jar");
    }

    if (!defined $pdfbox_ext_dir) {
        # The extension's environment variable is not set at all.
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    elsif (!-e $app_jar_path) {
        # The extension is configured but its jar file is missing.
        &gsprintf(STDERR,"**** Failed to find $app_jar_path\n");
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
    }
    else {
        # Test to see if java is in the path.
        # `java -version` is run instead of plain `java`, since the
        # %ERRORLEVEL% returned for `java` is 0 for JDK 1.6* but 1 for
        # JDK 1.7*, whereas `java -version` returns 0 whenever java is
        # installed, regardless of whether the JDK is 1.6* or 1.7*.
        my $java_cmd = &util::get_java_command();

        # Discard all output of the probe. The stdout redirect must come
        # before "2>&1": on Ubuntu the reverse order still prints to the
        # screen. No trailing "&" is needed on Linux.
        my $null_device = ($ENV{'GSDLOS'} =~ /^windows/i) ? "nul" : "/dev/null";
        my $probe_cmd = "$java_cmd -version >$null_device 2>&1";

        my $exit_status = system($probe_cmd);
        if ($exit_status != 0) {
            my $details = join("",
                "**** Testing for java\n",
                "Failed to run: $probe_cmd\n",
                "Error variable: |$!| and status: $exit_status\n");
            &gsprintf(STDERR, "PDFBoxConverter: $details");
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "couldnotrunjava";
        }
    }
}
# This converter declares no plugin arguments of its own.
my $arguments = [];

# Option description for this converter; new() appends it to the shared
# "OptList" and appends $arguments to the shared "ArgList".
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BaseMediaConverter->new
#   $hashArgOptLists - hash ref; this converter's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#   $auxilary        - passed through unchanged to BaseMediaConverter->new
#
# Returns the object produced by BaseMediaConverter->new, re-blessed into
# $class. When PDFBox conversion is available the launch command strings
# are stored on the object; otherwise the reason key is stored and a
# message is written to the object's 'outhandle'.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than indirect-object syntax
    # ("new BaseMediaConverter(...)"), which Perl can mis-parse.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
        my $java = &util::get_java_command();

        $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";

        # NOTE(review): the line.separator value is a single newline
        # character (written as \n here; it was a raw line break inside the
        # string literal before). Confirm this is the intended separator for
        # the HTML launch command and that the embedded newline is accepted
        # by the Windows command shell.
        $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"\n\" org.apache.pdfbox.tools.ExtractText";

        # $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png)

        # We use this next cmd to launch our custom PDFBox class
        # (PDFBoxToImagesAndText.java) to convert each PDF page into an image
        # (gif, jpg, png) AND its extracted text. Or just each page's
        # extracted text. An item file is still generated, but this time
        # referring to txtfiles too, not just the images. Result: searchable
        # paged output.
        # The custom class lives in the build folder, so that folder is added
        # to the classpath for the launch cmd.
        my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build");
        my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);

        # Use the same java command as the other launch commands (and as the
        # availability probe in BEGIN) instead of a hard-coded "java".
        $self->{'pdfbox_imgtxt_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        # Record why conversion is unavailable and report it.
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
# Reset the list of temporary file paths tracked on this object.
#
# Parameters (after $self): $verbosity, $outhandle, $failhandle. They are
# accepted for interface compatibility but are not used here.
#
# Returns nothing meaningful.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Store an empty array reference. Assigning an empty list "()" to a
    # scalar hash slot stores undef, not an empty collection.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
# Tear-down hook: delegates to the object's clean_up_temporary_files
# method and hands back whatever that method returns.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
sub convert {
my $self = shift(@_);
my ($source_file_full_path, $target_file_type) = @_;
return 0 unless $pdfbox_conversion_available;
# check the filename
return 0 if ( !-f $source_file_full_path);
# Although PDFBoxConverter inherits from AutoLoadConverters and therefore
# doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options
# in case in future PDFBoxConverter no longer inherits from AutoLoadConverters
# and ends up going through gsConvert.pl
$self->{'convert_options'} .= " -pdf_tool pdfbox";
my $img_output_mode = 0;
my $convert_to = $self->{'convert_to'};
my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0;
# the following line is necessary to avoid 'uninitialised variable' error
# messages concerning the converted_to member variable when PDFPlugin's
# use_sections option is checked.
# PDFBox plugin now processes use_sections option, when working with v1.5.0
# of the PDFBox jar file (which embeds each page in special