root/gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm @ 32198

Revision 32198, 12.0 KB (checked in by ak19, 16 months ago)

Fixing an issue with PDFBox to txt conversion, whereby PDFBox to txt conversion would produce text that's actually HTML in pre tags. Not sure if this issue was introduced during the recent commit upgrading the pdfbox version from 1.8.2 to 2.0.9, or whether it already existed. But fixing it now so that text conversion with PDFBox actually produces txt, while html conversion still produces the old cheap html without preserving any images in the src pdf.

Line 
###########################################################################
#
# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32no strict 'subs'; # allow barewords (eg STDERR) as function arguments
33
34#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
35
36use gsprintf 'gsprintf';
37use FileUtils;
38
# Do not give these two variables initial values at the point of declaration,
# or they will get stuck at those values. They are assigned inside the BEGIN
# block that follows instead.
our $pdfbox_conversion_available;
our $no_pdfbox_conversion_reason;

# Compile-time setup: establish inheritance from BaseMediaConverter and work
# out whether PDFBox conversion can be used at all. On failure,
# $pdfbox_conversion_available is set to 0 and $no_pdfbox_conversion_reason
# holds a short reason key.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Assume conversion is possible until one of the checks below fails.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    my $gextpb_home = $ENV{'GEXT_PDFBOX'};

    if (!defined $gextpb_home) {
        # The PDFBox extension has not been set up in the environment.
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    else {
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home, "lib", "java", "pdfbox-app.jar");

        if (-e $pbajar) {
            # Test to see if java is in the path.
            # Run `java -version` rather than plain `java`: the %ERRORLEVEL%
            # returned for plain `java` is 0 for JDK 1.6* but 1 for JDK 1.7*,
            # whereas `java -version` returns 0 whenever java is installed,
            # regardless of whether the JDK version is 1.6* or 1.7*.
            my $java = &util::get_java_command();

            # Discard all output of the probe. The stdout redirect has to come
            # before "2>&1": on Ubuntu `java 2>&1 >/dev/null` still prints to
            # the screen, while on Windows either order works.
            my $null_device = ($ENV{'GSDLOS'} =~ /^windows/i) ? "nul" : "/dev/null";
            my $cmd = "$java -version >$null_device 2>&1";

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message = "**** Testing for java\n"
                    . "Failed to run: $cmd\n"
                    . "Error variable: |$!| and status: $status\n";

                &gsprintf(STDERR, "PDFBoxConverter: $error_message");

                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
        else {
            # The extension directory is there but the PDFBox jar is not.
            &gsprintf(STDERR,"**** Failed to find $pbajar\n");
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
    }

}
100
# Plugin-specific arguments (none) and the option descriptor that new()
# appends to the caller-supplied argument/option lists.
my $arguments = [ ];

my $options = { 'name'     => "PDFBoxConverter",
                'desc'     => "{PDFBoxConverter.desc}",
                'abstract' => "yes",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is pushed onto it
#   $inputargs       - passed through to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's arguments and options
#   $auxilary        - passed through to the BaseMediaConverter constructor
#
# Returns the object blessed into $class. When PDFBox conversion is available
# the launch commands for text, html and image conversion are stored on the
# object; otherwise the reason is stored and a message is printed to the
# object's outhandle.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
        my $java = &util::get_java_command();

        $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
        $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";

        # Image output uses the custom class PDFBoxToImagesAndText, which
        # converts each PDF page into an image (gif, jpg, png) AND its
        # extracted text, so that the generated item file refers to text files
        # as well as images (searchable paged output). That class lives in the
        # extension's build folder, so the folder is added to the classpath.
        my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build");
        my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);

        # Launch with the same java command as the txt/html commands (and as
        # the availability probe at load time) instead of a bare "java".
        $self->{'pdfbox_img_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;

}
151
# Per-run initialisation: start with an empty list of temporary output paths.
#
# Parameters ($verbosity, $outhandle, $failhandle) are accepted but not used
# by this implementation.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Store a real (empty) array reference. Assigning the empty list "()" to a
    # hash element stores undef, which only worked because later pushes
    # autovivified the array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
158
# Tear-down hook: removes whatever temporary output this converter recorded
# by delegating to clean_up_temporary_files().
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
164
165
# Convert a PDF with PDFBox into text, html, or per-page images.
#
# Parameters:
#   $source_file_full_path - path of the PDF to convert
#   $target_file_type      - "html", an image type ("jpg", "gif" or "png"),
#                            or anything else for plain text
#
# Returns 0 when PDFBox conversion is unavailable or the source is not a
# plain file. Otherwise returns the list ($success, $result, $target_file_path)
# where $success is 0 if an error was flagged and 1 if not. For image output,
# $target_file_path is the item file path returned by util::create_itemfile.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $pdfbox_conversion_available;
    # check the filename
    return 0 if ( !-f $source_file_full_path);

    # File::Basename is called by its fully qualified name below; load it here
    # so this sub does not depend on some other module having loaded it.
    require File::Basename;

    my $img_output_mode = 0;

    # Setting converted_to is necessary to avoid 'uninitialised variable'
    # error messages concerning that member variable when PDFPlugin's
    # use_sections option is checked.
    if ($target_file_type eq "html") {
        $self->{'converted_to'} = "HTML";
    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
        $self->{'converted_to'} = $target_file_type;
        $img_output_mode = 1;
    } else {
        $self->{'converted_to'} = "text";
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # append the output filetype suffix only for non-image output formats,
        # since for images we can be outputting multiple image files per
        # single PDF input file
        my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
    }
    else {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);

        # for image files, remove the suffix and use the result as a
        # directory, since we can have many output image files per input PDF
        # (one img for each page of the PDF, for example)
        if ($img_output_mode) {
            $target_file_path =~ s/\.[^.]*$//g;
            if (!&FileUtils::directoryExists($target_file_path)) {
                mkdir($target_file_path);
            }
        }

        # Remember the tmp file (or, for images, the directory just created)
        # so that everything generated can be deleted in one go on clean up.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }

    # Generate and run the convert command
    my $convert_cmd = "";

    # want the filename without extension, because any images
    # are to be generated with the same filename as the PDF
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");

    if ($img_output_mode) { # converting to images
        my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);

        $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
        $convert_cmd .= " -imageType $target_file_type";
        $convert_cmd .= " -outputPrefix \"$output_prefix\"";
        $convert_cmd .= " \"$source_file_full_path\"";
    }
    else { # html or text
        if ($target_file_type eq "html") {
            $convert_cmd = $self->{'pdfbox_html_launch_cmd'} . " -html";
        } else {
            $convert_cmd = $self->{'pdfbox_txt_launch_cmd'};
        }
        $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    }

    if ($verbosity>2) {
        &gsprintf($outhandle,"Convert command: $convert_cmd\n");
    }

    my $print_info = { 'message_prefix' => "PDFBox Conversion",
                       'message' => "Converting $source_file_no_path to: $target_file_type" };

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);

    if ($img_output_mode) {
        # now the images have been generated, generate the item file for
        # them, which is also the target_file_path that needs to be returned
        $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    }
    elsif ($self->{'converted_to'} eq "text") {
        # Escape ampersands in the text output (& -> &amp;). Conversion to
        # html does it automatically, but conversion to text doesn't, and
        # this results in illegal characters in doc.xml.
        #
        # The escaping is rewritten into the output file itself, so it must
        # run once per generated file. With caching enabled, a reused cached
        # file has already been escaped by the run that produced it; escaping
        # it again would compound the escaping (&amp; -> &amp;amp;).
        # NOTE(review): assumes autorun_general_cmd returns a true first value
        # whenever it actually re-ran the command - confirm against
        # BaseMediaConverter.
        if ($regenerated || !$self->{'enable_cache'}) {
            my $fulltext = &FileUtils::readUTF8File($target_file_path);
            if (defined $fulltext) {
                $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
                &FileUtils::writeUTF8File($target_file_path, \$fulltext);
            } else {
                print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n";
                $had_error = 1;
            }
        }
    }

    if ($had_error) {
        return (0, $result,$target_file_path);
    }
    return (1, $result,$target_file_path);
}
293
# Wrapper around convert() that forwards the source path and target type
# together with the conversion options, the conversion id (each defaulting to
# the empty string when missing or false) and the literal "without_result".
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    $convert_options ||= "";
    $convert_id      ||= "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}
305
# Delete every temporary output path recorded in pbtmp_file_paths, then reset
# the list. Directories are removed recursively, other existing paths are
# removed as files, and paths that no longer exist are skipped.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate the list never having been set up (e.g. init() was not called).
    my $recorded_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$recorded_paths) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Reset to a real empty array reference; assigning "()" would store undef.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
321
322
3231;
Note: See TracBrowser for help on using the browser.