source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 25498

Last change on this file since 25498 was 25498, checked in by ak19, 12 years ago

On CentOS, the test command whose output is sent to /dev/null (with stderr merged into stdout) no longer needs a trailing `&`. Ubuntu never needed it anyway. This commit will be followed by another one, after which the zip and tar files of the extension will be generated.

File size: 7.7 KB
Line 
1###########################################################################
2#
3# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use gsprintf 'gsprintf';
34
# Package-wide conversion state, shared by every PDFBoxConverter object.
#
# These two variables must be declared WITHOUT an initialiser. The BEGIN
# block below assigns them at compile time; an initialiser on the
# declaration would run later, at run time, and overwrite the values
# computed in BEGIN (they would get stuck at the initialiser's values).
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time set-up: establish inheritance and probe for a usable PDFBox
# installation. On exit, $pdfbox_conversion_available is 1 or 0 and, when 0,
# $no_pdfbox_conversion_reason names the check that failed.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Check that PDFBox is installed and available on the path.
    # Assume success until one of the checks below fails.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    if (!defined $ENV{'GEXT_PDFBOX'}) {
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    else {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        if (!-e $pbajar) {
            print STDERR "Failed to find $pbajar\n";
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
        else {
            # Test to see if java is in the path.
            # Need to run `java -version` instead of just `java`, since the
            # %ERRORLEVEL% returned for `java` (which is checked below for
            # failure of the command) is 0 for JDK 1.6* while it is 1 for
            # JDK 1.7*. For `java -version`, %ERRORLEVEL% is 0 whenever java
            # is installed, regardless of the JDK version.
            my $cmd = "java -version";

            # Treat an unset GSDLOS as "not Windows" instead of matching
            # against an undefined value.
            my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
            if ($gsdlos =~ /^windows/i) {
                # java 2>&1 >nul or java >nul 2>&1 both work (%ERRORLEVEL% is 0)
                $cmd .= " >nul 2>&1";
            }
            else {
                # On Ubuntu, java >/dev/null 2>&1 works, but
                # java 2>&1 >/dev/null doesn't: output goes to screen anyway.
                # A trailing " &" is not needed (CentOS no longer requires
                # it, and Ubuntu was already fine without it).
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                print STDERR "Testing for java\n";
                print STDERR "Failed to run: $cmd\n";

                # $! is only meaningful when the command could not be
                # launched at all (system returned -1). Otherwise decode
                # the wait status so the real cause is reported.
                if ($status == -1) {
                    print STDERR "$!\n";
                }
                elsif ($status & 127) {
                    print STDERR "Terminated by signal " . ($status & 127) . "\n";
                }
                else {
                    print STDERR "Exit status: " . ($status >> 8) . "\n";
                }

                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
    }

}
90
# This plugin defines no arguments of its own; the (empty) list is what
# new() appends to the shared "ArgList".
my $arguments = [];

# Option descriptor that new() appends to the shared "OptList".
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
98
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument list and option descriptor
#   $auxilary        - handed on unchanged to the BaseMediaConverter constructor
#
# Returns the object built by BaseMediaConverter, blessed into $class.
# When PDFBox is usable the object carries 'pdfbox_launch_cmd'; otherwise it
# carries 'no_pdfbox_conversion_reason' and a notice is written to the
# object's 'outhandle'. In 'info_only' mode neither is set up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call. The indirect-object form
    # ("new BaseMediaConverter(...)") is ambiguous to the parser inside a
    # package that defines its own new().
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        # Base command line; convert() appends the mode flag and file names.
        my $launch_cmd = "java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText";

        $self->{'pdfbox_launch_cmd'} = $launch_cmd;
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;

}
134
# Prepare per-run state: start with an empty list of temporary output files.
#
# Any further arguments (the original unpacked them as $verbosity,
# $outhandle and $failhandle) are accepted but not used here.
# Returns nothing.
sub init {
    my $self = shift(@_);

    # Store an array reference. Assigning "()" to a hash slot stores undef,
    # which only worked later on because push/foreach autovivify it.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
141
# Tear-down hook: delegates to clean_up_temporary_files() and hands back
# whatever that call returns.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
147
148
# Run PDFBox's ExtractText on one source file.
#
# Arguments:
#   $source_file_full_path - path of the file to convert
#   $target_file_type      - "html" selects HTML output (adds -html to the
#                            command); any other value is recorded as "text"
#                            and is also used as the output file extension
#
# Returns 0 when PDFBox conversion is unavailable or the source is not a
# plain file. Otherwise returns the list (status, $result, $target_file_path),
# where status is 0 if autorun_general_cmd() reported an error and 1 if not.
sub convert {
    my ($self, $source_file_full_path, $target_file_type) = @_;

    # Guard clauses: nothing to do without PDFBox or without a real file.
    return 0 unless $pdfbox_conversion_available;
    return 0 unless -f $source_file_full_path;

    my $wants_html = ($target_file_type eq "html");

    # Setting converted_to avoids 'uninitialised variable' messages about
    # this member when PDFPlugin's use_sections option is checked. PDFBox
    # v1.5.0 embeds each page in special <div> tags, which is what lets the
    # use_sections option be processed.
    $self->{'converted_to'} = $wants_html ? "HTML" : "text";

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Work out where the converted output is to be written.
    my $target_file_path;
    if (!$self->{'enable_cache'}) {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # Remember the file so clean_up_temporary_files() can remove it.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }
    else {
        $self->init_cache_for_file($source_file_full_path);
        my $cached_name = $self->{'cached_file_root'} . "." . $target_file_type;
        $target_file_path = &util::filename_cat($self->{'cached_dir'}, $cached_name);
    }

    # Assemble the command line: launcher, optional mode flag, quoted paths.
    my $html_flag = $wants_html ? " -html" : "";
    my $convert_cmd = $self->{'pdfbox_launch_cmd'}
                    . $html_flag
                    . " \"$source_file_full_path\" \"$target_file_path\"";

    print $outhandle "Convert command: $convert_cmd\n" if ($verbosity > 2);

    my %progress_info = (
        'message_prefix' => "PDFBox Conversion",
        'message'        => "Converting $source_file_no_path to: $target_file_type",
    );

    # The first value returned by autorun_general_cmd() is not needed here.
    my (undef, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path,
                                     $target_file_path, \%progress_info);

    my $succeeded = $had_error ? 0 : 1;
    return ($succeeded, $result, $target_file_path);
}
208
# Thin wrapper around convert(): forwards the source path and target type,
# followed by the convert options, the convert id and the literal marker
# "without_result". Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type,
        $convert_options, $convert_id) = @_;

    # Missing or false optional arguments collapse to the empty string.
    $convert_options ||= "";
    $convert_id      ||= "";

    my @forwarded = ($source_file_path, $target_file_type,
                     $convert_options, $convert_id, "without_result");

    return $self->convert(@forwarded);
}
220
# Delete every temporary output file recorded in 'pbtmp_file_paths' that
# still exists, then reset the record to an empty list.
# Returns nothing.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate an unset list (e.g. clean-up requested before init()).
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$recorded_paths) {
        if (-e $pbtmp_file_path) {
            &util::rm($pbtmp_file_path);
        }
    }

    # Reset to an array reference. Assigning "()" here would store undef
    # rather than an empty list.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
232
233
234
2351;
Note: See TracBrowser for help on using the repository browser.