source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 27510

Last change on this file since 27510 was 27510, checked in by ak19, 11 years ago

Using the recommended FileUtils.pm methods in place of the deprecated utils.pm methods.

File size: 10.3 KB
RevLine 
###########################################################################
#
# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
[25500]32no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[22669]33
34use gsprintf 'gsprintf';
35
# Neither variable may be given an initial value at its declaration: such an
# assignment would execute at run time, after the BEGIN block below has already
# stored its findings, and would overwrite them.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Runs the availability checks in order and returns "" when they all
    # pass, otherwise the key naming the first check that failed.
    my $diagnose = sub {
        # GEXT_PDFBOX must be set: it is the root under which the jar is sought
        if (!defined $ENV{'GEXT_PDFBOX'}) {
            return "gextpdfboxnotinstalled";
        }

        my $jar_path = &FileUtils::filenameConcatenate($ENV{'GEXT_PDFBOX'},"lib","java","pdfbox-app.jar");
        if (!-e $jar_path) {
            &gsprintf(STDERR,"**** Failed to find $jar_path\n");
            return "gextpdfboxjarnotinstalled";
        }

        # Test whether java can be run.
        # `java -version` is used rather than plain `java` because the
        # %ERRORLEVEL% of plain `java` is 0 for JDK 1.6* but 1 for JDK 1.7*,
        # whereas `java -version` gives 0 for both whenever java is installed.
        my $java_exe = &util::get_java_command();
        my $probe_cmd = "$java_exe -version";

        # Discard the probe's output. Redirection order matters on Ubuntu:
        # ">/dev/null 2>&1" works, "2>&1 >/dev/null" still prints to screen.
        # On Windows either order works. No trailing "&" is needed.
        $probe_cmd .= ($ENV{'GSDLOS'} =~ /^windows/i) ? " >nul 2>&1" : " >/dev/null 2>&1";

        my $exit_status = system($probe_cmd);
        return "" if ($exit_status == 0);

        my $error_message = join("",
                                 "**** Testing for java\n",
                                 "Failed to run: $probe_cmd\n",
                                 "Error variable: |$!| and status: $exit_status\n");
        &gsprintf(STDERR, "PDFBoxConverter: $error_message");

        return "couldnotrunjava";
    };

    $no_pdfbox_conversion_reason = $diagnose->();
    $pdfbox_conversion_available = ($no_pdfbox_conversion_reason eq "") ? 1 : 0;
}
97
# This helper plugin declares no arguments of its own.
my $arguments = [];

# Plugin description record: abstract, inheriting, with the (empty)
# argument list above attached under 'args'.
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
105
# Constructor.
#
# Registers this class in $pluginlist, appends this plugin's argument and
# option descriptions to $hashArgOptLists, and builds the object through
# BaseMediaConverter's constructor before re-blessing it into $class.
#
# When PDFBox was found usable at load time, two launch commands are stored
# on the object:
#   pdfbox_launch_cmd      - text/html extraction (org.apache.pdfbox.ExtractText)
#   pdfbox_img_launch_cmd  - page-to-image conversion (org.apache.pdfbox.PDFToImage)
# Otherwise the reason is stored under no_pdfbox_conversion_reason and a
# message is written to the object's outhandle.
#
# Parameters: ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxilary)
# Returns:    the new object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");

        # Both launch commands use the java executable reported by
        # util::get_java_command(), the same one that was probed when this
        # module was loaded, instead of whichever "java" is first on PATH.
        my $java = &util::get_java_command();

        $self->{'pdfbox_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText";

        # cmd for converting pages to images (gif, jpg, png)
        $self->{'pdfbox_img_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.PDFToImage";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
143
# Prepares the object for a run by starting with an empty list of temporary
# output paths.
#
# Parameters: ($self, $verbosity, $outhandle, $failhandle); the last three
#             are accepted but not used here.
# Returns:    nothing.
sub init {
    my ($self) = @_;

    # Store a real (empty) array reference. Assigning () to a hash slot
    # stores undef, which left later pushes and loops over this slot
    # depending on autovivification.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
150
# Tear-down hook: delegates to clean_up_temporary_files() on the object.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
156
157
# Converts a PDF with PDFBox into html, text, or a set of page images.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", one of "jpg"/"gif"/"png" (image mode),
#                            or anything else, which is treated as text
#
# Returns 0 when PDFBox is unavailable or the source is not a plain file.
# Otherwise returns ($status, $result, $target_file_path), where $status is
# 0 if the conversion command reported an error and 1 if not. In image mode
# the returned path is whatever util::create_itemfile hands back.
sub convert {
    my ($self, $source_file_full_path, $target_file_type) = @_;

    # Bail out early: no usable PDFBox, or no such input file
    if (!$pdfbox_conversion_available || !-f $source_file_full_path) {
        return 0;
    }

    my $img_output_mode
        = scalar(grep { $target_file_type eq $_ } ("jpg", "gif", "png")) ? 1 : 0;

    # converted_to must always be set, to avoid 'uninitialised variable'
    # messages about it when PDFPlugin's use_sections option is checked.
    # (PDFBox v1.5.0 embeds each page in special <div> tags, which is what
    # lets the use_sections option be processed.)
    $self->{'converted_to'} = ($target_file_type eq "html") ? "HTML"
                            : $img_output_mode              ? $target_file_type
                            :                                 "text";

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Work out where the output goes
    my $target_file_path;
    if (!$self->{'enable_cache'}) {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        if ($img_output_mode) {
            # One PDF can yield many images (e.g. one per page), so the
            # suffix is dropped and the remaining name is used as a directory
            $target_file_path =~ s/\.[^.]*$//g;
            mkdir($target_file_path) unless &FileUtils::directoryExists($target_file_path);

            # The directory itself is what gets remembered below, so the
            # images and the item file in it can be deleted in one go.
        }

        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }
    else {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # The filetype suffix is appended for non-image output only, since
        # image output can be several files per single PDF input file
        my $target_file = $file_root;
        $target_file .= ".$target_file_type" unless $img_output_mode;

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
    }

    # Source filename without directory or extension: generated images are
    # named after the PDF
    my ($tailname) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    # Assemble the conversion command
    my $convert_cmd;
    if ($img_output_mode) {
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'}
                     . " -imageType $target_file_type"
                     . " -outputPrefix \"$output_prefix\""
                     . " \"$source_file_full_path\"";
    }
    else {
        # html or text
        $convert_cmd = $self->{'pdfbox_launch_cmd'}
                     . (($target_file_type eq "html") ? " -html" : "")
                     . " \"$source_file_full_path\" \"$target_file_path\"";
    }

    &gsprintf($outhandle,"Convert command: $convert_cmd\n") if ($verbosity>2);

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);
    my $print_info = {
        'message_prefix' => "PDFBox Conversion",
        'message'        => "Converting $source_file_no_path to: $target_file_type",
    };

    # The first returned value (the "regenerated" flag) is not needed here
    my (undef, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    if ($img_output_mode) {
        # With the images generated, create the item file for them; its path
        # replaces the directory as the target path handed back to the caller
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }

    return (($had_error ? 0 : 1), $result, $target_file_path);
}
265
# Thin wrapper around convert(): forwards the source path and target type,
# followed by the conversion options, the conversion id (each replaced by ""
# when missing or false) and the literal marker "without_result".
# Returns whatever convert() returns.
# Note: the convert() defined in this package reads only its first two
# arguments.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    $convert_options ||= "";
    $convert_id      ||= "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
277
# Deletes every temporary output recorded in pbtmp_file_paths and then
# empties that list.
#
# Directories are removed with FileUtils::removeFilesRecursive, other
# existing paths with FileUtils::removeFiles; paths that no longer exist
# are skipped.
#
# Returns: nothing.
sub clean_up_temporary_files {
    my ($self) = @_;

    # Tolerate a list that was never set up
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$recorded_paths) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Reset to a real empty array reference; assigning () would store undef
    $self->{'pbtmp_file_paths'} = [];

    return;
}


1;
Note: See TracBrowser for help on using the repository browser.