Context Navigation

PDFBoxConverter.pm@ 32197

Last change on this file since 32197 was 32197, checked in by ak19, 6 years ago

Updates to the recent commit's modifications to do with pdfbox: new class has been renamed from GS_PDFToImagesAndText.java to org/greenstone/pdfbox/PDFBoxToImagesAndText.java and uses a GS package. This class file is no longer included in pdfbox-app.jar, but is just compiled against that. Added Apache v 2.0 licensing related files. PDFBoxConverter.pm now refers to the newly named Java class with the new org.greenstone.pdfbox package name. Updated the Readme to add instructions to do with compiling the new java file and its new folder/package structure, and information related to the Apache license. There's also the new java/build subfolder containing the precompiled class file (and Java pkg structure) for the new class. This new build folder with the new custom class, and the modified PDFBoxConverter.pm and the modified pdfbox-app.jar (without the custom class) are modifications to the pdfbox tarball/zip files too.

File size: 11.8 KB

Line
1	###########################################################################
2	#
3	# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2010 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	package PDFBoxConverter;
27
28	use BaseMediaConverter;
29
30	use strict;
31	no strict 'refs'; # allow filehandles to be variables and viceversa
32	no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34	#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
35
36	use gsprintf 'gsprintf';
37	use FileUtils;
38
# Package-wide availability flags.  They must stay unassigned at their
# declaration: an initialiser written here would execute after the BEGIN
# block below and overwrite whatever the probe worked out.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time probe: sets up inheritance, then checks in turn that the
# PDFBox extension is configured, that its jar exists and that java can be
# launched.  The first failing check switches conversion off and records a
# reason key.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Start optimistic; a failed check below flips the flag and names the cause.
    ($pdfbox_conversion_available, $no_pdfbox_conversion_reason) = (1, "");

    my $extension_home = $ENV{'GEXT_PDFBOX'};

    # The jar path can only be built once the extension home is known.
    my $jar_path = defined $extension_home
        ? &FileUtils::filenameConcatenate($extension_home, "lib", "java", "pdfbox-app.jar")
        : undef;

    if (!defined $extension_home) {
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    elsif (!-e $jar_path) {
        &gsprintf(STDERR, "**** Failed to find $jar_path\n");
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
    }
    else {
        # Probe with `java -version` rather than a bare `java`: the bare
        # command's exit status differs between JDK 1.6 and 1.7, whereas
        # -version reports 0 on both when java is installed.
        my $java_exe = &util::get_java_command();

        # Output is discarded; the redirection order ">X 2>&1" is the one
        # that works on Windows as well as on Linux.
        my $null_device = ($ENV{'GSDLOS'} =~ /^windows/i) ? "nul" : "/dev/null";
        my $probe_cmd = "$java_exe -version >$null_device 2>&1";

        my $exit_status = system($probe_cmd);

        if ($exit_status != 0) {
            my $details = join("",
                               "**** Testing for java\n",
                               "Failed to run: $probe_cmd\n",
                               "Error variable: |$!| and status: $exit_status\n");

            &gsprintf(STDERR, "PDFBoxConverter: $details");

            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "couldnotrunjava";
        }
    }
}
100
# This helper plugin contributes no arguments of its own.
my $arguments = [];

# Option block that new() appends to the caller's "OptList"; the description
# is a resource-key placeholder rather than literal text.
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
108
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref; this class's arguments are appended to its
#                      "ArgList" entry and its option block to "OptList"
#   $auxilary        - passed through to the BaseMediaConverter constructor
#
# Returns the object re-blessed into $class.  When PDFBox is usable the
# object carries 'pdfbox_launch_cmd' (text/html extraction) and
# 'pdfbox_img_launch_cmd' (per-page images plus text); otherwise it carries
# 'no_pdfbox_conversion_reason' and a notice is printed to the out handle.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax instead of "new BaseMediaConverter(...)": this package
    # defines its own new(), which makes the indirect-object form ambiguous.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home, "lib", "java", "pdfbox-app.jar");
        my $java = &util::get_java_command();

        # Plain text / HTML extraction using PDFBox's own ExtractText tool.
        $self->{'pdfbox_launch_cmd'} =
            "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";

        # Paged output: the custom PDFBoxToImagesAndText class turns each PDF
        # page into an image (gif, jpg, png) AND its extracted text.  The
        # class lives in the extension's build folder, so that folder joins
        # the jar on the classpath.
        my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home, "build");
        my $classpath = &util::pathname_cat($pbajar, $pdfbox_build);

        # Launch with the same java command as the text extraction above
        # rather than a hard-coded "java".
        $self->{'pdfbox_img_launch_cmd'} =
            "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
152
# Resets the list of temporary output paths that convert() records and
# clean_up_temporary_files() later removes.
#
# Callers may pass ($verbosity, $outhandle, $failhandle) after $self; this
# implementation does not use them.
sub init {
    my $self = shift(@_);

    # An explicit empty array ref: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];
}
159
# Tear-down hook: hands over to clean_up_temporary_files() so that any
# temporary conversion output recorded on this object is removed.
sub deinit {
    my ($converter) = @_;

    return $converter->clean_up_temporary_files();
}
165
166
# Converts a PDF with PDFBox.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", an image type ("jpg", "gif", "png"),
#                            or anything else for plain text
#
# Returns a bare 0 when conversion is unavailable, the source is not a plain
# file, or the temporary image directory cannot be created.  Otherwise
# returns the list ($ok, $result, $target_file_path), where $ok is 0 if an
# error was reported and 1 otherwise.  For image output the returned path is
# the item file generated for the page images.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $pdfbox_conversion_available;
    # check the filename
    return 0 if (!-f $source_file_full_path);

    my $img_output_mode = 0;

    # converted_to is always set, which avoids 'uninitialised variable'
    # messages about it when PDFPlugin's use_sections option is checked.
    if ($target_file_type eq "html") {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
        $self->{'converted_to'} = $target_file_type;
        $img_output_mode = 1;
    }
    else {
        $self->{'converted_to'} = "text";
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # append the output filetype suffix only for non-image output formats,
        # since for images there can be multiple image files per PDF
        my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, $target_file);
    }
    else {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # for image files, remove the suffix and use the result as a directory,
        # since there can be many output images per input PDF
        if ($img_output_mode) {
            $target_file_path =~ s/\.[^.]*$//g;
            if (!&FileUtils::directoryExists($target_file_path)) {
                # Without the directory the image conversion cannot write its
                # output, so report the failure instead of carrying on.
                if (!mkdir($target_file_path)) {
                    print STDERR "PDFBoxConverter::convert(): Unable to create directory $target_file_path: $!\n";
                    return 0;
                }
            }
        }

        # Remember the temporary file (or, for images, the whole directory)
        # so clean_up_temporary_files() can delete it in one go.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }

    # want the filename without extension, because any images
    # are to be generated with the same filename as the PDF
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    # Generate and run the convert command
    my $convert_cmd = "";
    if ($img_output_mode) { # converting to images
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
        $convert_cmd .= " -imageType $target_file_type";
        $convert_cmd .= " -outputPrefix \"$output_prefix\"";
        $convert_cmd .= " \"$source_file_full_path\"";
    }
    else { # html or text
        $convert_cmd = $self->{'pdfbox_launch_cmd'};
        $convert_cmd .= " -html" if ($target_file_type eq "html");
        $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    }

    if ($verbosity > 2) {
        &gsprintf($outhandle, "Convert command: $convert_cmd\n");
    }

    my $print_info = { 'message_prefix' => "PDFBox Conversion",
                       'message' => "Converting $source_file_no_path to: $target_file_type" };

    my ($regenerated, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);

    if ($img_output_mode) {
        # now the images have been generated, generate the item file for them,
        # which is also the target_file_path that needs to be returned
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }
    elsif ($self->{'converted_to'} eq "text" && $regenerated && !$had_error) {
        # Conversion to html escapes ampersands itself, but conversion to text
        # doesn't, and this results in illegal characters in doc.xml.
        #
        # Only freshly written output is escaped: rewriting a reused file
        # would escape its ampersands a second time.
        # NOTE(review): assumes autorun_general_cmd returns a false
        # $regenerated when cached output was reused - confirm against
        # BaseMediaConverter.
        my $fulltext = &FileUtils::readUTF8File($target_file_path);
        if (defined $fulltext) {
            $fulltext =~ s@&@&amp;@sg; # ensure doc contents don't break XML
            &FileUtils::writeUTF8File($target_file_path, \$fulltext);
        }
        else {
            print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n";
            $had_error = 1;
        }
    }

    if ($had_error) {
        return (0, $result, $target_file_path);
    }
    return (1, $result, $target_file_path);
}
289
# Thin wrapper around convert(): forwards the source path and target type,
# followed by the optional conversion options and conversion id (each
# defaulting to the empty string) and the literal marker "without_result".
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, @optional) = @_;

    my $convert_options = shift(@optional) || "";
    my $convert_id      = shift(@optional) || "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
301
# Deletes every temporary output recorded in $self->{'pbtmp_file_paths'}:
# directories are removed recursively, other existing paths as single files,
# and paths that no longer exist are skipped.  The list is then reset to
# empty.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate an unset list (e.g. init() was never called) without
    # autovivifying it just to iterate.
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@{$recorded_paths}) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # An explicit empty array ref: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];
}
317
318
319	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 32197

Download in other formats: