root/gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm @ 24762

Revision 24762, 7.3 KB (checked in by ak19, 8 years ago)

When launching the java command with 2>&1, still need the extra & at the end which was removed upon last commit: it is necessary on Linux CentOS, else launching GLI with the pdfbox extension fails with an inability to parse the plugins list.

Line 
1###########################################################################
2#
3# PDFBoxConverter - helper plugin that does pdf document conversion with PDFBox
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFBoxConverter;
27
28use BaseMediaConverter;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use gsprintf 'gsprintf';
34
35# these two variables mustn't be initialised here or they will get stuck
36# at those values.
37our $pdfbox_conversion_available;
38our $no_pdfbox_conversion_reason;
39
# Compile-time probe that decides, once, whether PDFBox conversion can be
# offered on this machine. The outcome is recorded in two package variables:
#   $pdfbox_conversion_available  - 1 if every check passed, otherwise 0
#   $no_pdfbox_conversion_reason  - "" on success, otherwise a short key that
#                                   new() appends to "PDFBoxConverter." when
#                                   building its diagnostic message
# The checks, in order: the GEXT_PDFBOX environment variable is set, the
# pdfbox-app.jar exists beneath it, and a java executable can be run.
BEGIN {
    @PDFBoxConverter::ISA = ('BaseMediaConverter');

    # Assume success, then downgrade as soon as a prerequisite is missing.
    $pdfbox_conversion_available = 1;
    $no_pdfbox_conversion_reason = "";

    if (!defined $ENV{'GEXT_PDFBOX'}) {
        $pdfbox_conversion_available = 0;
        $no_pdfbox_conversion_reason = "gextpdfboxnotinstalled";
    }
    else {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        if (!-e $pbajar) {
            print STDERR "Failed to find $pbajar\n";
            $pdfbox_conversion_available = 0;
            $no_pdfbox_conversion_reason = "gextpdfboxjarnotinstalled";
        }
        else {
            # Test to see if java is in the path.
            # Use "java -version" rather than a bare "java": a bare "java"
            # prints its usage text and exits non-zero on some JVMs, whereas
            # "-version" exits 0 whenever java can be run.
            # The command is run in the foreground: a trailing "&" would make
            # the shell report success even when java does not exist, turning
            # this check into a no-op.
            my $cmd = "java -version";

            # GSDLOS may be unset; treat that as "not Windows".
            my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
            if ($gsdlos =~ /^windows/i) {
                $cmd .= " >nul 2>&1";
            }
            else {
                # Order matters: ">/dev/null 2>&1" silences both streams,
                # "2>&1 >/dev/null" would still send stderr to the screen.
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                print STDERR "Testing for java\n";
                print STDERR "Failed to run: $cmd\n";
                if ($status == -1) {
                    # $! is only meaningful when the command could not be
                    # launched at all.
                    print STDERR "$!\n";
                }
                else {
                    print STDERR "Exit status: " . ($status >> 8) . "\n";
                }
                $pdfbox_conversion_available = 0;
                $no_pdfbox_conversion_reason = "couldnotrunjava";
            }
        }
    }

}
85
# Argument descriptors contributed by this plugin: it defines none of its own.
my $arguments = [];

# Option metadata for this plugin; new() pushes it onto the shared OptList.
my $options = {
    'name'     => "PDFBoxConverter",
    'desc'     => "{PDFBoxConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
93
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through untouched to BaseMediaConverter's
#                       constructor
#   $hashArgOptLists  - hash ref whose "ArgList" and "OptList" arrays receive
#                       this plugin's argument and option descriptors
#   $auxilary         - passed through untouched to BaseMediaConverter's
#                       constructor
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
# When PDFBox is usable the object carries 'pdfbox_launch_cmd'; otherwise it
# carries 'no_pdfbox_conversion_reason' and a notice is written to its
# 'outhandle'. In both cases 'pdfbox_conversion_available' is recorded,
# except when the object is 'info_only', which returns early.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->method call: the indirect-object form
    # ("new BaseMediaConverter(...)") is parsed by heuristic and is fragile in
    # a package that defines its own sub new.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($pdfbox_conversion_available) {
        my $gextpb_home = $ENV{'GEXT_PDFBOX'};
        my $pbajar = &util::filename_cat($gextpb_home,"lib","java","pdfbox-app.jar");

        # convert() appends the optional -html flag and the source and target
        # paths to this command.
        my $launch_cmd = "java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText";

        $self->{'pdfbox_launch_cmd'} = $launch_cmd;
    }
    else {
        $self->{'no_pdfbox_conversion_reason'} = $no_pdfbox_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "PDFBoxConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_pdfbox_conversion_reason})\n");
    }

    $self->{'pdfbox_conversion_available'} = $pdfbox_conversion_available;

    return bless $self, $class;

}
129
# Prepare the object for a run by starting with an empty list of temporary
# output files.
#
# Callers may also pass ($verbosity, $outhandle, $failhandle); this
# implementation accepts and ignores them.
sub init {
    my ($self) = @_;

    # Must be an array reference: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
136
# Tear-down hook: delegates to clean_up_temporary_files() so that any
# temporary output files recorded on the object are removed.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}
142
143
# Convert a PDF file with PDFBox's ExtractText tool.
#
# Arguments:
#   $source_file_full_path - path of the file to convert
#   $target_file_type      - used as the output file extension; "html" also
#                            adds the -html flag to the command
#
# Returns a plain 0 when PDFBox is unavailable or the source is not a regular
# file. Otherwise returns the list ($ok, $result, $target_file_path), where
# $ok is 0 if autorun_general_cmd() reported an error and 1 if it did not.
sub convert {
    my ($self, $source_file_full_path, $target_file_type) = @_;

    # Guard clauses: nothing to do without PDFBox or without a real file.
    if (!$pdfbox_conversion_available) {
        return 0;
    }
    unless (-f $source_file_full_path) {
        return 0;
    }

    my $wants_html = ($target_file_type eq "html");

    # Always record converted_to: leaving it unset leads to 'uninitialised
    # variable' messages when PDFPlugin's use_sections option is checked.
    # (With v1.5.0 of the PDFBox jar each page is embedded in special <div>
    # tags, which is what lets use_sections be processed.)
    $self->{'converted_to'} = $wants_html ? "HTML" : "text";

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    # Work out where the converted output should be written.
    my $target_file_path;
    if (!$self->{'enable_cache'}) {
        # this is in gsdl/tmp. get a tmp filename in collection instead???
        $target_file_path = &util::get_tmp_filename($target_file_type);
        # Remember the file so clean_up_temporary_files() can delete it.
        push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    }
    else {
        # init_cache_for_file() is expected to populate cached_dir and
        # cached_file_root, which are read immediately afterwards.
        $self->init_cache_for_file($source_file_full_path);
        my $cached_name = $self->{'cached_file_root'} . "." . $target_file_type;
        $target_file_path = &util::filename_cat($self->{'cached_dir'}, $cached_name);
    }

    # Assemble the command line: launch command, optional -html, then the
    # quoted source and target paths.
    my $html_flag = $wants_html ? " -html" : "";
    my $convert_cmd = $self->{'pdfbox_launch_cmd'}
        . $html_flag
        . " \"$source_file_full_path\" \"$target_file_path\"";

    print {$outhandle} "Convert command: $convert_cmd\n" if ($verbosity > 2);

    my %progress_info = (
        'message_prefix' => "PDFBox Conversion",
        'message'        => "Converting $source_file_no_path to: $target_file_type",
    );

    # The first returned value (whether the output was regenerated) is not
    # needed here.
    my (undef, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path,
                                     $target_file_path, \%progress_info);

    my $succeeded = $had_error ? 0 : 1;
    return ($succeeded, $result, $target_file_path);
}
203
# Wrapper around convert() that forwards the source path and target type,
# followed by the optional conversion options, the optional conversion id and
# the literal marker "without_result".
#
# Missing or false optional arguments are normalised to the empty string.
# Whatever convert() returns is returned unchanged.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type,
        $convert_options, $convert_id) = @_;

    $convert_options = "" unless $convert_options;
    $convert_id      = "" unless $convert_id;

    my @forwarded = ($source_file_path, $target_file_type,
                     $convert_options, $convert_id, "without_result");

    return $self->convert(@forwarded);
}
215
# Delete every temporary output file recorded in 'pbtmp_file_paths' that
# still exists on disk, then reset the list to empty.
#
# Safe to call when the list has never been initialised.
sub clean_up_temporary_files {
    my ($self) = @_;

    # Fall back to an empty list if nothing has been recorded yet.
    my $tmp_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$tmp_paths) {
        if (-e $pbtmp_file_path) {
            &util::rm($pbtmp_file_path);
        }
    }

    # Must be an array reference: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];

    return;
}
227
228
229
2301; 
Note: See TracBrowser for help on using the browser.