Context Navigation

PDFBoxConverter.pm@ 32089

Last change on this file since 32089 was 32089, checked in by ak19, 6 years ago

Attempted fix by Kathy and me for Diego's problem with PDFBox's handling of a PDF. 1. When it was set to convert_to_html, it built fine, but convert_to_text produced something that was invalid XML in doc.xml and the build failed. Diego reasoned correctly that building ought to succeed in both cases if it succeeded in one case. Kathy found the correct fix for escaping the ampersand character (it wasn't the & to &amp; replacement that I'd attempted, nor did using HTML::Entities' encode work either). 2. The fix needed to read and write files, so readUTF8File() and writeUTF8File() were introduced into FileUtils.pm for reusability. We still need to contact John Thompson to ask him if and how these functions need to be modified to support parallel processing, for which FileUtils was written.

File size: 11.0 KB

Line
1	###########################################################################
2	#
3	# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2010 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	package PDFBoxConverter;
27
28	use BaseMediaConverter;
29
30	use strict;
31	no strict 'refs'; # allow filehandles to be variables and viceversa
32	no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34	#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
35
36	use gsprintf 'gsprintf';
37	use FileUtils;
38
39	# these two variables mustn't be initialised here or they will get stuck
40	# at those values.
41	our $pdfbox_conversion_available;
42	our $no_pdfbox_conversion_reason;
43
# Compile-time probe: decides once, when the module is loaded, whether PDFBox
# conversion can be offered.  The outcome is recorded in the package variables
# $pdfbox_conversion_available (1/0) and $no_pdfbox_conversion_reason (a short
# reason key, empty when conversion is available).
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Start from "available" and let each failed check below revoke it.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    # Records that conversion is unavailable, together with the reason key.
    my $mark_unavailable = sub {
        my ($reason_key) = @_;
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = $reason_key;
    };

    # Check that PDFBox is installed and available on the path
    my $pdfbox_home = $ENV{'GEXT_PDFBOX'};
    my $pdfbox_jar;
    if (defined $pdfbox_home) {
        $pdfbox_jar = &FileUtils::filenameConcatenate($pdfbox_home,"lib","java","pdfbox-app.jar");
    }

    if (!defined $pdfbox_home) {
        $mark_unavailable->("gextpdfboxnotinstalled");
    }
    elsif (!-e $pdfbox_jar) {
        &gsprintf(STDERR,"**** Failed to find $pdfbox_jar\n");
        $mark_unavailable->("gextpdfboxjarnotinstalled");
    }
    else {
        # test to see if java is in path
        # Need to run java -version instead of just java, since the %ERRORLEVEL%
        # returned for `java` (which is checked below for failure of the command)
        # is 0 for JDK 1.6* while %ERRORLEVEL% is 1 for JDK 1.7*
        # If `java -version` is run however, %ERRORLEVEL% returned is 0 if java is
        # installed, regardless of whether the JDK version is 1.6* or 1.7*.
        my $java_cmd = &util::get_java_command();

        # Silence the probe's output.  On Windows either redirect order works;
        # on Ubuntu only ">/dev/null 2>&1" (in that order) keeps the output off
        # the screen.  A trailing & is no longer needed on Linux.
        my $silencer = ($ENV{'GSDLOS'} =~ /^windows/i)
            ? " >nul 2>&1"
            : " >/dev/null 2>&1";
        my $probe_cmd = "$java_cmd -version" . $silencer;

        my $exit_status = system($probe_cmd);

        if ($exit_status != 0) {
            my $error_message = join("",
                "**** Testing for java\n",
                "Failed to run: $probe_cmd\n",
                "Error variable: \|$!\| and status: $exit_status\n");

            &gsprintf(STDERR, "PDFBoxConverter: $error_message");

            $mark_unavailable->("couldnotrunjava");
        }
    }
}
100
# This helper plugin defines no command-line arguments of its own.
my $arguments = [];

# Plugin description record that new() appends to the caller's "OptList".
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
108
# Constructor.
#
# Appends this class to the plugin list, adds this plugin's argument and
# option records to the shared lists, and delegates construction to
# BaseMediaConverter.  When the compile-time probe found PDFBox usable, the
# command lines used by convert() are stored on the object; otherwise the
# reason is stored and a notice is printed to the object's outhandle.
#
# Parameters:
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through to BaseMediaConverter's constructor
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays are extended
#   $auxilary        - passed through to BaseMediaConverter's constructor
#
# Returns the new object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than the ambiguous indirect-object form
    # "new BaseMediaConverter(...)".
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
        my $java = &util::get_java_command();

        # cmd for extracting text/html; PDFBox's line separator is set to <br />
        $self->{'pdfbox_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText";

        # cmd for converting pages to images (gif, jpg, png).  Uses the same
        # java command as the text extraction above (and as the availability
        # probe) instead of a hard-coded "java", so both conversions run with
        # the same Java.
        $self->{'pdfbox_img_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.PDFToImage";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;
}
146
# Prepares the converter for a run by resetting the list of temporary output
# paths that convert() records and clean_up_temporary_files() later removes.
#
# Called as $self->init($verbosity, $outhandle, $failhandle); the three
# arguments are accepted for interface compatibility but are not used here.
sub init {
    my $self = shift(@_);

    # Must be an array *reference*: assigning the empty list () to a hash
    # element stores undef, not an empty array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
153
# Tear-down hook: removes whatever temporary files this converter created by
# delegating to clean_up_temporary_files().
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
159
160
# Converts a PDF with PDFBox to html, text, or per-page images.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", one of "jpg"/"gif"/"png" (image output),
#                            or anything else, which is treated as text
#
# Returns 0 when PDFBox conversion is unavailable or the source is not a plain
# file.  Otherwise returns the list ($ok, $result, $target_file_path), where
# $ok is 0 if the conversion reported an error and 1 if it did not.  For image
# output $target_file_path is the item file created for the generated images.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $pdfbox_conversion_available;
    # check the filename
    return 0 if ( !-f $source_file_full_path);

    my $img_output_mode = 0;

    # the following line is necessary to avoid 'uninitialised variable' error
    # messages concerning the converted_to member variable when PDFPlugin's
    # use_sections option is checked.
    # PDFBox plugin now processes use_sections option, when working with v1.5.0
    # of the PDFBox jar file (which embeds each page in special <div> tags).
    if ($target_file_type eq "html") {
        $self->{'converted_to'} = "HTML";
    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
        $self->{'converted_to'} = $target_file_type;
        $img_output_mode = 1;
    } else {
        $self->{'converted_to'} = "text";
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # append the output filetype suffix only for non-image output formats, since for
        # images we can be outputting multiple image files per single PDF input file
        my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
    }
    else {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # for image files, remove the suffix, since we can have many output image files
        # per input PDF (one img for each page of the PDF, for example)
        if ($img_output_mode) {
            $target_file_path =~ s/\.[^.]*$//g;
            if (!&FileUtils::directoryExists($target_file_path)) {
                # Report (but do not abort on) a failed mkdir; the conversion
                # command below will then fail and be reported as an error.
                mkdir($target_file_path)
                    or print STDERR "PDFBoxConverter::convert(): could not create directory $target_file_path: $!\n";
            }

            # once the item file for the imgs has been created, need to adjust target_file_path

            # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
            # item file generated in it can be deleted in one go on clean_up
        }

        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }

    # Generate and run the convert command
    my $convert_cmd = "";

    # want the filename without extension, because any images
    # are to be generated with the same filename as the PDF
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    if ($img_output_mode) { # converting to images
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
        $convert_cmd .= " -imageType $target_file_type";
        $convert_cmd .= " -outputPrefix \"$output_prefix\"";
        $convert_cmd .= " \"$source_file_full_path\"";
    } else { # html or text
        $convert_cmd = $self->{'pdfbox_launch_cmd'};
        $convert_cmd .= " -html" if ($target_file_type eq "html");
        $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    }

    if ($verbosity>2) {
        &gsprintf($outhandle,"Convert command: $convert_cmd\n");
    }

    my $print_info = { 'message_prefix' => "PDFBox Conversion",
                       'message' => "Converting $source_file_no_path to: $target_file_type" };

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    if ($img_output_mode) {
        # now the images have been generated, generate the "$target_file_path/tailname.item"
        # item file for them, which is also the target_file_path that needs to be returned
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }
    elsif ($self->{'converted_to'} eq "text" && !$had_error) {
        # Escape ampersands in PDFBox's text output: & -> &amp;
        # Conversion to html does it automatically, but conversion to text doesn't
        # and this results in illegal characters in doc.xml.
        # Skipped when the conversion failed, since there may be no usable
        # output file to rewrite.
        # NOTE(review): if a cached output file is reused rather than
        # regenerated, this would escape it a second time -- confirm against
        # autorun_general_cmd's $regenerated return value.
        my $fulltext = &FileUtils::readUTF8File($target_file_path);
        if (defined $fulltext) {
            #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help
            $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
            &FileUtils::writeUTF8File($target_file_path, \$fulltext);
        }
        else {
            # Don't overwrite the output with an empty file; flag the failure.
            print STDERR "PDFBoxConverter::convert(): unable to read converted file $target_file_path\n";
            $had_error = 1;
        }
    }

    return (($had_error ? 0 : 1), $result, $target_file_path);
}
278
# Variant entry point that forwards to convert().
#
# Called as
#   $self->convert_without_result($source_file_path, $target_file_type,
#                                 $convert_options, $convert_id);
# $convert_options and $convert_id default to "" when omitted or false.
# All four values, followed by the literal "without_result", are passed on to
# $self->convert(), and whatever convert() returns is returned unchanged.
sub convert_without_result {
    my $self = shift(@_);
    my ($source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    # Same defaulting as "shift(@_) || ''": any false value becomes "".
    $convert_options = "" unless $convert_options;
    $convert_id      = "" unless $convert_id;

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
290
# Deletes every temporary output recorded in $self->{'pbtmp_file_paths'} and
# then empties that list.  Directories are removed recursively, other existing
# paths are removed as files, and paths that no longer exist are skipped.
sub clean_up_temporary_files {
    my $self = shift(@_);

    foreach my $pbtmp_file_path (@{$self->{'pbtmp_file_paths'}}) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Reset to an empty array *reference*; assigning () would store undef.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
306
307
308	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 32089

Download in other formats: