source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 23666

Last change on this file since 23666 was 23666, checked in by ak19, 13 years ago
  1. PDFBoxConverter.pm changed to get PDFBox to work when there's spaces in the path. 2. Needed to commit previous changes to setup.bat to the zip files as well.
File size: 6.6 KB
Line 
1###########################################################################
2#
3# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use gsprintf 'gsprintf';
34
# These two package variables must not be given an initial value at their
# declaration: the BEGIN block below assigns them at compile time, and an
# initialiser on the declaration would run later and overwrite the result.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time availability probe for the PDFBox extension.
#
# Sets up inheritance from BaseMediaConverter and then decides whether PDFBox
# conversion can be used at all:
#   1. the GEXT_PDFBOX environment variable must be defined,
#   2. <GEXT_PDFBOX>/lib/java/pdfbox-app.jar must exist,
#   3. a "java" executable must be runnable from the path.
# On failure $pdfbox_conversion_available is set to 0 and
# $no_pdfbox_conversion_reason to a short key naming the failed check.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Assume conversion is possible until one of the checks below fails.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    if (!defined $ENV{'GEXT_PDFBOX'}) {
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    else {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        if (!-e $pbajar) {
            print STDERR "Failed to find $pbajar\n";
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
        else {
            # Test to see if java is in the path.
            #
            # "java -version" is used rather than a bare "java" because the
            # exit code of the bare launcher is not a reliable success
            # indicator, whereas -version exits with 0 whenever java runs.
            # The command is run in the foreground (no trailing "&"): a
            # backgrounded command makes the shell report success
            # immediately, so a missing java would never be detected.
            # stdout is redirected first and stderr then pointed at it, so
            # that both streams are really discarded.
            my $cmd = "java -version";
            if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) {
                $cmd .= " >nul 2>&1";
            }
            else {
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                print STDERR "Testing for java\n";
                print STDERR "Failed to run: $cmd\n";
                if ($status == -1) {
                    # Only in this case does $! describe the failure.
                    print STDERR "$!\n";
                }
                else {
                    print STDERR "Exit status: " . ($status >> 8) . "\n";
                }
                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
    }
}
83
# Plugin argument descriptions; this converter adds none of its own.
my $arguments = [];

# Plugin option table that new() appends to the caller's "OptList".
# 'args' points at the $arguments list above.
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
91
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option tables are
#                      appended to its "ArgList" and "OptList" entries
#   $auxilary        - passed through to the BaseMediaConverter constructor
#
# Returns the object blessed into $class. Unless the object is "info_only",
# it carries 'pdfbox_conversion_available' and either 'pdfbox_launch_cmd'
# (conversion available) or 'no_pdfbox_conversion_reason' (not available).
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;

    # Register this plugin and its argument/option tables with the caller.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    # When only plugin information is wanted, skip all conversion set-up.
    return bless $self, $class if $self->{'info_only'};

    if (!$pdfbox_conversion_available) {
        # Remember why conversion is unavailable and tell the user.
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }
    else {
        # Build the command prefix once. The jar path is wrapped in double
        # quotes so that paths containing spaces survive the shell.
        my $jar_path = &util::filename_cat($ENV{'GEXT_PDFBOX'},"lib","java","pdfbox-app.jar");
        $self->{'pdfbox_launch_cmd'} = qq{java -cp "$jar_path" org.apache.pdfbox.ExtractText};
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
127
# Prepare the converter for a run by starting with an empty list of
# temporary output files (the list clean_up_temporary_files() later deletes).
#
# Parameters (after $self): $verbosity, $outhandle, $failhandle. They are
# accepted for interface compatibility but not used here.
#
# Returns nothing.
sub init {
    my $self = shift(@_);

    # Store an array reference. Assigning an empty list "()" to a hash
    # element is a scalar assignment and would store undef instead.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
134
# Counterpart of init(): delegates to clean_up_temporary_files() and passes
# its result straight back to the caller.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
140
141
# Convert a document with PDFBox's ExtractText tool.
#
# Parameters:
#   $source_file_full_path - full path of the file to convert
#   $target_file_type      - extension of the wanted output; "html" adds the
#                            -html switch to the command
#
# Returns a plain 0 when PDFBox is unavailable or the source is not a regular
# file. Otherwise returns the list (status, result, target_file_path), where
# status is 0 if autorun_general_cmd() reported an error and 1 if not.
sub convert {
    my ($self, $source_file_full_path, $target_file_type) = @_;

    # Nothing can be done without PDFBox or without a real input file.
    return 0 if !$pdfbox_conversion_available;
    return 0 unless -f $source_file_full_path;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Work out where the output goes: a throw-away temporary file, or a file
    # in the cache directory when caching is enabled.
    my $target_file_path;
    if (!$self->{'enable_cache'}) {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);
        # Remember the file so clean_up_temporary_files() can delete it.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }
    else {
        # init_cache_for_file() must run before the cached_* entries are read.
        $self->init_cache_for_file($source_file_full_path);
        my $cached_name = $self->{'cached_file_root'} . "." . $target_file_type;
        $target_file_path = &util::filename_cat($self->{'cached_dir'}, $cached_name);
    }

    # Assemble the command line; both paths are double-quoted for the shell.
    my @cmd_parts = ($self->{'pdfbox_launch_cmd'});
    push(@cmd_parts, "-html") if ($target_file_type eq "html");
    push(@cmd_parts, qq{"$source_file_full_path"}, qq{"$target_file_path"});
    my $convert_cmd = join(" ", @cmd_parts);

    print $outhandle "Convert command: $convert_cmd\n" if ($verbosity > 2);

    my $print_info = {
        'message_prefix' => "PDFBox Conversion",
        'message'        => "Converting $source_file_no_path to: $target_file_type",
    };

    # The first value returned (whether the output was regenerated) is unused.
    my (undef, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path,
                                     $target_file_path, $print_info);

    return (($had_error ? 0 : 1), $result, $target_file_path);
}
190
# Wrapper around convert().
#
# Parameters:
#   $source_file_path - file to convert
#   $target_file_type - extension of the wanted output
#   $convert_options  - optional, defaults to ""
#   $convert_id       - optional, defaults to ""
#
# Forwards all of these plus the literal "without_result" to convert() and
# returns whatever convert() returns.
# NOTE(review): convert() as defined in this file only reads its first two
# arguments, so the trailing three currently have no effect - confirm whether
# that is intended.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    # Any false value (undef, "", 0) becomes the empty string.
    $convert_options ||= "";
    $convert_id      ||= "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
202
# Delete every temporary output file recorded in 'pbtmp_file_paths' that
# still exists, then reset the record to an empty list.
#
# Safe to call when no list has been set up yet (for example deinit()
# without a preceding init()).
#
# Returns nothing.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Fall back to an empty list when nothing has been recorded.
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$recorded_paths) {
        if (-e $pbtmp_file_path) {
            &util::rm($pbtmp_file_path);
        }
    }

    # Store an array reference. Assigning an empty list "()" to a hash
    # element is a scalar assignment and would store undef instead.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
214
215
216
2171;
Note: See TracBrowser for help on using the repository browser.