source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 25500

Last change on this file since 25500 was 25500, checked in by ak19, 12 years ago

Dr Bainbridge modified the gsprintf code to print text containing ampersand, less-than and greater-than characters as their entity values, so that printing to STDERR from BEGIN statements (so far used only in PDFBoxConverter of the PDFBox extension) will play nicely with the XML generated for Pluginfo.pl. Pluginfo.pl has also been modified to use the correct gsprintf printing methods.

File size: 7.9 KB
Line 
1###########################################################################
2#
3# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34use gsprintf 'gsprintf';
35
# Package-wide availability state, filled in by the BEGIN block below.
# Deliberately declared WITHOUT initialisers: BEGIN runs at compile time, so an
# assignment written here would execute later (at run time) and leave the
# variables stuck at that value instead of what BEGIN worked out.
our ($pdfbox_conversion_available, $no_pdfbox_conversion_reason);
40
# Compile-time probe: sets up inheritance and works out whether PDFBox
# conversion can be offered, recording the outcome in
# $pdfbox_conversion_available / $no_pdfbox_conversion_reason.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Start out optimistic; the first check that fails flips the flag,
    # records a reason key and abandons the remaining checks.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    PROBE: {
        # 1. The PDFBox extension must be announced via the environment.
        unless (defined $ENV{'GEXT_PDFBOX'}) {
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
            last PROBE;
        }

        # 2. The PDFBox application jar must exist inside the extension.
        my $pbajar = &util::filename_cat($ENV{'GEXT_PDFBOX'},"lib","java","pdfbox-app.jar");
        unless (-e $pbajar) {
            &gsprintf(STDERR,"**** Failed to find $pbajar\n");
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
            last PROBE;
        }

        # 3. java must be runnable from the path.
        # "java -version" is used rather than plain "java": the original
        # author observed that plain "java" exits with 0 on JDK 1.6* but 1 on
        # JDK 1.7*, whereas "java -version" exits with 0 on both whenever
        # java is installed.
        my $cmd = "java -version";

        # Discard all output.  On Unix the order matters: the original
        # author notes that "2>&1 >/dev/null" still printed to the screen on
        # Ubuntu, while ">/dev/null 2>&1" is silent.  No trailing "&" is needed.
        $cmd .= ($ENV{'GSDLOS'} =~ /^windows/i) ? " >nul 2>&1" : " >/dev/null 2>&1";

        my $status = system($cmd);
        last PROBE if ($status == 0);

        my $error_message = join("",
                                 "**** Testing for java\n",
                                 "Failed to run: $cmd\n",
                                 "Error variable: |$!| and status: $status\n");

        &gsprintf(STDERR, "PDFBoxConverter: $error_message");

        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "couldnotrunjava";
    }
}
95
# This converter declares no plugin arguments of its own.
my $arguments = [];

# Plugin option descriptor; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
103
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to BaseMediaConverter->new
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays receive
#                      this plugin's $arguments and $options
#   $auxilary        - handed on unchanged to BaseMediaConverter->new
#
# Returns the object built by BaseMediaConverter, blessed into $class.
# Unless 'info_only' is set, the object also carries
# 'pdfbox_conversion_available' and either 'pdfbox_launch_cmd' (PDFBox usable)
# or 'no_pdfbox_conversion_reason' (PDFBox not usable).
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->method call.  The indirect-object form
    # "new BaseMediaConverter(...)" relies on parser heuristics, which is
    # fragile in a package that defines its own sub named "new".
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        # Build the command prefix once; convert() appends the per-file part.
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");
        my $launch_cmd = "java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText";

        $self->{'pdfbox_launch_cmd'} = $launch_cmd;
    }
    else {
        # Remember why conversion is unavailable and tell the user.
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
139
# Prepare the converter for a run by starting with an empty list of
# temporary output files.
#
# Callers pass ($verbosity, $outhandle, $failhandle) after $self; none of
# them is used here.
sub init {
    my $self = shift(@_);

    # Must be an array ref: assigning "()" to a scalar slot stores undef,
    # not an empty list.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
146
# Tear-down hook: delegates to clean_up_temporary_files() and passes its
# result straight back.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
152
153
# Convert a document by running the PDFBox ExtractText command.
#
# Arguments:
#   $source_file_full_path - path of the file to convert
#   $target_file_type      - "html" adds the -html switch; the value is also
#                            used when naming the output file
#
# Returns:
#   0                                if PDFBox is unavailable or the source
#                                    is not a plain file
#   (0, $result, $target_file_path)  if the conversion command had an error
#   (1, $result, $target_file_path)  otherwise
sub convert {
    my ($self, $source_file_full_path, $target_file_type) = @_;

    # Nothing to do without PDFBox, or without a real file to read.
    return 0 unless ($pdfbox_conversion_available && -f $source_file_full_path);

    # Always record what we are converting to.  Per the original author this
    # avoids 'uninitialised variable' messages about converted_to when
    # PDFPlugin's use_sections option is checked; use_sections is handled
    # when working with v1.5.0 of the PDFBox jar, which embeds each page in
    # special <div> tags.
    $self->{'converted_to'} = ($target_file_type eq "html") ? "HTML" : "text";

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Work out where the output goes: a temporary file (remembered for later
    # clean-up) or, with caching enabled, a file in the cache directory.
    my $target_file_path;
    if (!$self->{'enable_cache'}) {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }
    else {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_directory  = $self->{'cached_dir'};
        my $cached_root_name = $self->{'cached_file_root'};
        my $cached_file_name = $cached_root_name . "." . $target_file_type;
        $target_file_path = &util::filename_cat($cache_directory,$cached_file_name);
    }

    # Assemble the command line: launch prefix, optional -html switch, then
    # the quoted source and target paths.
    # NOTE(review): paths are only wrapped in double quotes; a path that
    # itself contains a double quote would break the command - confirm
    # whether callers can ever pass one.
    my @cmd_parts = ($self->{'pdfbox_launch_cmd'});
    push(@cmd_parts, "-html") if ($target_file_type eq "html");
    push(@cmd_parts, "\"$source_file_full_path\"", "\"$target_file_path\"");
    my $convert_cmd = join(" ", @cmd_parts);

    &gsprintf($outhandle,"Convert command: $convert_cmd\n") if ($verbosity>2);

    my $print_info = {
        'message_prefix' => "PDFBox Conversion",
        'message'        => "Converting $source_file_no_path to: $target_file_type",
    };

    # The first value returned by autorun_general_cmd is not needed here.
    my (undef, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    return (($had_error ? 0 : 1), $result, $target_file_path);
}
213
# Thin wrapper around convert(): forwards the source path and target type,
# followed by the conversion options, the conversion id (each replaced by ""
# when false or missing) and the literal marker "without_result".
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type,
        $convert_options, $convert_id) = @_;

    $convert_options ||= "";
    $convert_id      ||= "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
225
# Delete every temporary output file recorded in 'pbtmp_file_paths' that
# still exists, then reset the list to empty.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate a list that was never set up.
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$recorded_paths) {
        if (-e $pbtmp_file_path) {
            &util::rm($pbtmp_file_path);
        }
    }

    # Must be an array ref: assigning "()" to a scalar slot stores undef,
    # not an empty list.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
237
238
239
2401;
Note: See TracBrowser for help on using the repository browser.