Context Navigation

PDFBoxConverter.pm@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion is moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though, being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

File size: 12.4 KB

Line
1	###########################################################################
2	#
3	# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2010 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	package PDFBoxConverter;
27
28	use BaseMediaConverter;
29
30	use strict;
31	no strict 'refs'; # allow filehandles to be variables and viceversa
32	no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34	#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
35
36	use gsprintf 'gsprintf';
37	use FileUtils;
38
# Availability flag and failure-reason key for PDFBox conversion.
# They are deliberately declared without initial values: initialising them
# at the declaration would leave them stuck at those values. The BEGIN
# block below assigns them instead.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time setup: establishes the inheritance chain and probes the
# environment to decide whether PDFBox conversion can be offered.
# Three things are checked, in order:
#   1. the GEXT_PDFBOX environment variable is defined,
#   2. pdfbox-app.jar exists under $GEXT_PDFBOX/lib/java,
#   3. the java command can be run.
# The first failing check clears $pdfbox_conversion_available and records
# a reason key in $no_pdfbox_conversion_reason.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Optimistic defaults; overridden below by whichever check fails
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    my $gextpb_home = $ENV{'GEXT_PDFBOX'};

    if (defined $gextpb_home) {
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");

        if (-e $pbajar) {
            # Probe for java with `java -version` rather than plain `java`:
            # plain `java` yields %ERRORLEVEL% 0 on JDK 1.6* but 1 on JDK 1.7*,
            # whereas `java -version` yields 0 on both whenever java is installed.
            my $java = &util::get_java_command();

            # Discard all output of the probe. The stdout redirect has to come
            # before "2>&1": on Ubuntu the reverse order still prints to screen.
            # No trailing "&" is needed on Linux.
            my $null_device = ($ENV{'GSDLOS'} =~ /^windows/i) ? "nul" : "/dev/null";
            my $cmd = "$java -version >$null_device 2>&1";

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message = "**** Testing for java\n"
                    . "Failed to run: $cmd\n"
                    . "Error variable: \|$!\| and status: $status\n";

                &gsprintf(STDERR, "PDFBoxConverter: $error_message");

                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
        else {
            # The extension is configured but its jar file is missing
            &gsprintf(STDERR,"**** Failed to find $pbajar\n");
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
    }
    else {
        # The PDFBox extension has not been set up in this environment
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }

}
100
# Argument and option descriptions for this converter. new() pushes them
# onto the ArgList/OptList entries of the $hashArgOptLists it is given.
# This class contributes no arguments of its own.
my $arguments = [ ];

my $options = { 'name'     => "PDFBoxConverter",
                'desc'     => "{PDFBoxConverter.desc}",
                'abstract' => "yes",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BaseMediaConverter
#   $hashArgOptLists - hash ref whose ArgList/OptList entries are extended
#                      with this class's arguments and options
#   $auxilary        - passed through unchanged to BaseMediaConverter
#
# Returns the object blessed into $class. When PDFBox is available the
# object carries three launch commands (pdfbox_txt_launch_cmd,
# pdfbox_html_launch_cmd, pdfbox_img_launch_cmd); otherwise it carries
# no_pdfbox_conversion_reason and a notice is written to its outhandle.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call. The indirect-object form
    # ("new BaseMediaConverter(...)") is parsed heuristically and is fragile
    # inside a package that defines its own new().
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
        my $java = &util::get_java_command();

        $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
        $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";

        # Image output is produced by the custom PDFBoxToImagesAndText class,
        # which converts each PDF page into an image (gif, jpg, png) AND its
        # extracted text, giving searchable paged output. The generated item
        # file refers to the text files as well as the images.
        # That class lives in the extension's build folder, so the build
        # folder is added to the classpath alongside the PDFBox jar.
        my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build");
        my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);

        # Use the same java command as the text/html launch commands (and as
        # the availability test) instead of relying on a bare "java" on PATH.
        $self->{'pdfbox_img_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;

}
151
# Resets the list of temporary output paths that convert() records and
# clean_up_temporary_files() later removes.
#
# Parameters ($verbosity, $outhandle, $failhandle) are accepted but not
# used by this implementation.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Store a real empty array ref. Assigning the empty list "()" to a hash
    # slot stores undef, which only worked because later push/foreach calls
    # autovivified the array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
158
# Tear-down hook: delegates to clean_up_temporary_files() so that any
# temporary paths recorded on this object are removed.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
164
165
# Converts a PDF file to html, text, or one image per page using PDFBox.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", or an image type ("jpg", "gif", "png");
#                            any other value produces text output
#
# Returns:
#   0 on its own when PDFBox is unavailable or the source is not a plain file;
#   otherwise the list (status, $result, $target_file_path), where status is
#   1 on success and 0 if the conversion reported an error. For image output
#   the returned path is the item file produced by util::create_itemfile.
#
# Side effects: sets $self->{'converted_to'}, adds "-pdf_tool pdfbox" to
# $self->{'convert_options'}, and (when caching is disabled) records the
# temporary output path in $self->{'pbtmp_file_paths'} for later clean-up.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $pdfbox_conversion_available;
    # check the filename
    return 0 if ( !-f $source_file_full_path);

    # Although PDFBoxConverter inherits from AutoLoadConverters and therefore
    # doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options
    # in case in future PDFBoxConverter no longer inherits from AutoLoadConverters
    # and ends up going through gsConvert.pl.
    # Add the flag only once: convert() runs for every document, and blindly
    # appending would make the options string grow with each call.
    my $convert_options = defined $self->{'convert_options'} ? $self->{'convert_options'} : "";
    if (index($convert_options, "-pdf_tool pdfbox") < 0) {
        $self->{'convert_options'} = $convert_options . " -pdf_tool pdfbox";
    }

    my $img_output_mode = 0;

    # the following is necessary to avoid 'uninitialised variable' error
    # messages concerning the converted_to member variable when PDFPlugin's
    # use_sections option is checked.
    # PDFBox plugin now processes use_sections option, when working with v1.5.0
    # of the PDFBox jar file (which embeds each page in special <div> tags).
    if ($target_file_type eq "html") {
        $self->{'converted_to'} = "HTML";
    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
        $self->{'converted_to'} = $target_file_type;
        $img_output_mode = 1;
    } else {
        $self->{'converted_to'} = "text";
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # append the output filetype suffix only for non-image output formats, since for
        # images we can be outputting multiple image files per single PDF input file
        my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
    }
    else {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # for image files, remove the suffix, since we can have many output image files
        # per input PDF (one img for each page of the PDF, for example)
        if ($img_output_mode) {
            $target_file_path =~ s/\.[^.]*$//g;
            if (!&FileUtils::directoryExists($target_file_path)) {
                # Report a failed mkdir rather than silently carrying on
                mkdir($target_file_path)
                    or print STDERR "PDFBoxConverter::convert(): Unable to create directory $target_file_path: $!\n";
            }

            # once the item file for the imgs has been created, need to adjust target_file_path

            # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
            # item file generated in it can be deleted in one go on clean_up
        }

        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }

    # Generate and run the convert command
    my $convert_cmd = "";

    # want the filename without extension, because any images
    # are to be generated with the same filename as the PDF
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    if ($img_output_mode) { # converting to images
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
        $convert_cmd .= " -imageType $target_file_type";
        $convert_cmd .= " -outputPrefix \"$output_prefix\"";
        $convert_cmd .= " \"$source_file_full_path\"";

    } else { # html or text

        if ($target_file_type eq "html") {
            $convert_cmd = $self->{'pdfbox_html_launch_cmd'};
            $convert_cmd .= " -html";
        } else {
            $convert_cmd = $self->{'pdfbox_txt_launch_cmd'};
        }
        $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    }

    if ($verbosity>2) {
        &gsprintf($outhandle,"Convert command: $convert_cmd\n");
    }

    my $print_info = { 'message_prefix' => "PDFBox Conversion",
                       'message' => "Converting $source_file_no_path to: $target_file_type" };

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    if ($img_output_mode) {
        # now the images have been generated, generate the "$target_file_path/tailname.item"
        # item file for them, which is also the target_file_path that needs to be returned
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }
    elsif ($self->{'converted_to'} eq "text") {
        # ensure ampersands are escaped for pdfbox to text conversion: & -> &amp;
        # conversion to html does it automatically, but conversion to text doesn't
        # and this results in illegal characters in doc.xml

        my $fulltext = &FileUtils::readUTF8File($target_file_path);
        if (defined $fulltext) {
            # The replacement must be the entity "&amp;"; replacing "&" with a
            # bare "&" is a no-op and leaves the text unescaped.
            $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
            &FileUtils::writeUTF8File($target_file_path, \$fulltext);
        } else {
            print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n";
            $had_error = 1;
        }
    }

    if ($had_error) {
        return (0, $result,$target_file_path);
    }
    return (1, $result,$target_file_path);
}
299
# Thin wrapper around convert(): forwards the source path and target type,
# plus the optional convert options and convert id (each defaulting to the
# empty string when missing or false) and the literal marker "without_result".
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    $convert_options = "" unless $convert_options;
    $convert_id      = "" unless $convert_id;

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
311
# Removes every temporary path recorded in $self->{'pbtmp_file_paths'} and
# then empties the list. Directories are removed recursively; other existing
# paths are removed as files; paths that no longer exist are skipped.
sub clean_up_temporary_files {
    my $self = shift(@_);

    foreach my $pbtmp_file_path (@{$self->{'pbtmp_file_paths'}}) {
        if (-d $pbtmp_file_path) {
            # image output mode stores a whole directory of generated files
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Reset to a real empty array ref. Assigning the empty list "()" to a
    # hash slot stores undef rather than an empty list.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
327
328
329	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm@ 32273

Download in other formats: