source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. The pdfbox_conversion option is moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

File size: 12.4 KB
Line 
1###########################################################################
2#
3# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
35
36use gsprintf 'gsprintf';
37use FileUtils;
38
39# these two variables mustn't be initialised here or they will get stuck
40# at those values.
41our $pdfbox_conversion_available;
42our $no_pdfbox_conversion_reason;
43
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Compile-time probe: decide once whether PDFBox conversion can be used.
    # Start optimistic; each failed probe below clears the flag and records
    # a reason key.
    ($pdfbox_conversion_available, $no_pdfbox_conversion_reason) = (1, "");

    my $ext_home = $ENV{'GEXT_PDFBOX'};
    my $jar_path = (defined $ext_home)
        ? &FileUtils::filenameConcatenate($ext_home, "lib", "java", "pdfbox-app.jar")
        : undef;

    if (!defined $ext_home) {
        # The PDFBox extension's environment variable is not set.
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    elsif (!-e $jar_path) {
        # The extension directory is known but the jar is not in it.
        &gsprintf(STDERR, "**** Failed to find $jar_path\n");
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
    }
    else {
        # Test whether java can be run.
        # `java -version` is used rather than a bare `java`, because the exit
        # status of bare `java` differs between JDK 1.6* (0) and JDK 1.7* (1),
        # whereas `java -version` exits with 0 whenever java is installed.
        my $java = &util::get_java_command();

        # Discard all output of the probe.  The order ">dest 2>&1" matters on
        # Linux: "2>&1 >/dev/null" still lets output reach the screen.
        my $silence = ($ENV{'GSDLOS'} =~ /^windows/i)
            ? " >nul 2>&1"
            : " >/dev/null 2>&1";
        my $cmd = "$java -version" . $silence;

        my $status = system($cmd);

        if ($status != 0) {
            my $error_message = join("",
                "**** Testing for java\n",
                "Failed to run: $cmd\n",
                "Error variable: |$!| and status: $status\n");

            &gsprintf(STDERR, "PDFBoxConverter: $error_message");

            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "couldnotrunjava";
        }
    }
}
100
# This converter declares no plugin arguments of its own.
my $arguments = [];

# Option block that new() pushes onto the caller's "OptList".
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
108
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to BaseMediaConverter->new
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#   $auxilary        - passed through to BaseMediaConverter->new
#
# Returns the object blessed into $class.  When PDFBox conversion is
# available (as determined at compile time by the BEGIN block), the
# launch commands for text, HTML and image conversion are stored on the
# object; otherwise the reason is stored and reported on the outhandle.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect-object form
    # "new BaseMediaConverter(...)" is ambiguous inside a package that
    # itself defines a sub called new.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
        my $java = &util::get_java_command();

        $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
        $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";

        # Image output is produced by the custom PDFBoxToImagesAndText class,
        # which converts each PDF page into an image (gif, jpg, png) AND its
        # extracted text, giving searchable paged output.  That class lives
        # in the extension's build folder, so the build folder is added to
        # the classpath next to the PDFBox jar.
        my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build");
        my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);

        # Use the same java command as the text/html launch commands (and as
        # the availability probe in BEGIN) instead of a hard-coded "java".
        $self->{'pdfbox_img_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
151
# Reset the per-run list of temporary output paths.
#
# Callers may pass ($verbosity, $outhandle, $failhandle) after $self;
# none of them is used here.
sub init {
    my $self = shift(@_);

    # Must be an array *reference*: assigning the empty list "()" to a
    # scalar slot stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];
}
158
# Tear-down hook: delegates to clean_up_temporary_files() on this object.
sub deinit {
    my ($converter) = @_;

    return $converter->clean_up_temporary_files();
}
164
165
# Convert a PDF file with PDFBox.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", an image type ("jpg", "gif", "png"),
#                            or anything else, which is treated as text
#
# Returns:
#   0 (a single value) when PDFBox is unavailable or the source is not a
#   plain file; otherwise the list ($ok, $result, $target_file_path) where
#   $ok is 1 on success and 0 when an error was detected.  For image output
#   $target_file_path is the generated item file.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $pdfbox_conversion_available;
    # check the filename
    return 0 if ( !-f $source_file_full_path);

    # Although PDFBoxConverter inherits from AutoLoadConverters and therefore
    # doesn't go through gsConvert.pl, still set the -pdf_tool flag in
    # convert_options in case in future PDFBoxConverter no longer inherits
    # from AutoLoadConverters and ends up going through gsConvert.pl.
    # convert() runs once per document, so only add the flag when it is not
    # already there; otherwise the option string grows with every call.
    $self->{'convert_options'} = "" unless defined $self->{'convert_options'};
    if ($self->{'convert_options'} !~ m/-pdf_tool\s+pdfbox\b/) {
        $self->{'convert_options'} .= " -pdf_tool pdfbox";
    }

    my $img_output_mode = 0;

    # converted_to must always be set, to avoid 'uninitialised variable'
    # messages concerning that member variable when PDFPlugin's use_sections
    # option is checked.
    if ($target_file_type eq "html") {
        $self->{'converted_to'} = "HTML";
    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
        $self->{'converted_to'} = $target_file_type;
        $img_output_mode = 1;
    } else {
        $self->{'converted_to'} = "text";
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # append the output filetype suffix only for non-image output formats,
        # since for images we can be outputting multiple image files per
        # single PDF input file
        my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
    }
    else {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # for image files, remove the suffix and use the result as a
        # directory, since we can have many output image files per input PDF
        # (one img for each page of the PDF, for example)
        if ($img_output_mode) {
            $target_file_path =~ s/\.[^.]*$//;
            if (!&FileUtils::directoryExists($target_file_path)) {
                # Report a failed mkdir instead of ignoring it silently; the
                # conversion is still attempted, as before.
                mkdir($target_file_path)
                    or print STDERR "PDFBoxConverter::convert(): Unable to create directory $target_file_path: $!\n";
            }
        }

        # Remember the tmp file (or, for images, the tmp directory holding
        # all imgs and the item file) so it can be deleted on clean up.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }

    # Generate and run the convert command
    my $convert_cmd = "";

    # want the filename without extension, because any images
    # are to be generated with the same filename as the PDF
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    if ($img_output_mode) { # converting to images
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
        $convert_cmd .= " -imageType $target_file_type";
        $convert_cmd .= " -outputPrefix \"$output_prefix\"";
        $convert_cmd .= " \"$source_file_full_path\"";
    }
    else { # html or text
        if ($target_file_type eq "html") {
            $convert_cmd = $self->{'pdfbox_html_launch_cmd'};
            $convert_cmd .= " -html";
        } else {
            $convert_cmd = $self->{'pdfbox_txt_launch_cmd'};
        }
        $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    }

    if ($verbosity>2) {
        &gsprintf($outhandle,"Convert command: $convert_cmd\n");
    }

    my $print_info = { 'message_prefix' => "PDFBox Conversion",
                       'message' => "Converting $source_file_no_path to: $target_file_type" };

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    if ($img_output_mode) {
        # now the images have been generated, generate the
        # "$target_file_path/tailname.item" item file for them, which is also
        # the target_file_path that needs to be returned
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }
    elsif ($self->{'converted_to'} eq "text") {
        # ensure html entities are doubly escaped for pdfbox to text
        # conversion: &amp; -> &amp;amp;
        # conversion to html does it automatically, but conversion to text
        # doesn't and this results in illegal characters in doc.xml
        my $fulltext = &FileUtils::readUTF8File($target_file_path);
        if (defined $fulltext) {
            $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
            &FileUtils::writeUTF8File($target_file_path, \$fulltext);
        } else {
            print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n";
            $had_error = 1;
        }
    }

    if ($had_error) {
        return (0, $result,$target_file_path);
    }
    return (1, $result,$target_file_path);
}
299
# Wrapper around convert(): forwards the source path and target type,
# followed by the convert options and convert id (each defaulting to the
# empty string when false) and the literal marker "without_result".
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $opts, $id) = @_;

    $opts ||= "";
    $id   ||= "";

    return $self->convert($source_file_path, $target_file_type,
                          $opts, $id, "without_result");
}
311
# Delete every temporary file or directory recorded in pbtmp_file_paths,
# then reset the list.  Directories are removed recursively; paths that no
# longer exist are skipped.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate an unset list (e.g. init() was never called).
    my $pbtmp_file_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$pbtmp_file_paths) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Must be an array *reference*: assigning the empty list "()" to a
    # scalar slot stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
327
328
3291;
Note: See TracBrowser for help on using the repository browser.