source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm

Last change on this file was 35401, checked in by anupama, 3 years ago

Committing Dr Bainbridge's improvements to the Tika-preconfigured UnknownConverterPlugin: 1. Introducing the OS-agnostic %%GSDLHOME variable into the model collConfig.xml file which the UnknownConverterPlugin.pm will replace with $GSDLHOME or %GSDLHOME% as needed. The perl file will now also handle GSDL3HOME and GSDL3SRCHOME similarly. 2. The tika-app-1.24.1.jar is now renamed to just tika-app.jar so that UnknownConverterPlugin's exec_cmd works on Windows too, where there is no file globbing or wildcard to expand tika-app*.jar as there was on Linux. The gs2build/ext/tika folder's README has been updated to mention the version number of the tika-app jar file we're using.

File size: 13.0 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
49# Then a folder of html files is generated per document?
50# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
51
# Register both parent classes at compile time. The order matters:
# UnknownPlugin comes first, so it wins method resolution for anything
# both parents define.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
55
# Output formats the external converter may be asked to produce. Each entry
# pairs the option value with the resource key of its description, which
# always has the form {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(text html pagedimg_jpg pagedimg_gif pagedimg_png)
];
68
# Plugin-specific options: the command line to run and the format it produces.
my $arguments = do {
    # Command template; its %%PLACEHOLDERS are filled in by this plugin.
    my %exec_cmd_option = (
        'name' => 'exec_cmd',
        'desc' => '{UnknownConverterPlugin.exec_cmd}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'yes',
    );

    # Format the converter is expected to emit; defaults to plain text.
    my %convert_to_option = (
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'text',
    );

    [ \%exec_cmd_option, \%convert_to_option ];
};
81
# Descriptor for this plugin: concrete (not abstract), inheriting its parents'
# options, and adding the arguments declared in $arguments.
my $options = {
    'name'     => 'UnknownConverterPlugin',
    'desc'     => '{UnknownConverterPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
87
88
# Constructor.
#
# Parameters:
#   $class           - class to bless the new plugin into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptors
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, it is returned before any conversion or secondary-plugin set-up.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct Class->new(...) calls rather than the indirect-object form
    # "new Class(...)": the latter can be mis-parsed as a call to this
    # package's own new() depending on what has been compiled so far.
    # NOTE(review): the meaning of the trailing 1 is defined by
    # UnknownPlugin::new, which is not visible from this file.
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Feed the parents to merge_inheritance() in the same order as they are
    # declared in @ISA.
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);
    $self = bless $self, $class;

    # When only plugin information is wanted (e.g. pluginfo) there is nothing
    # more to configure. Returning here also means it does not matter that
    # convert_to has not yet been given its default value.
    return $self if $self->{'info_only'};

    # Sets convert_to_plugin and convert_to_ext, among other settings.
    $self->set_standard_convert_settings();

    # One-time placeholder replacement in exec_cmd. The paths are inserted
    # WITHOUT surrounding quotes because they are usually part of a larger
    # path; quoting is left to whoever wrote the command.
    my $cmd = $self->{'exec_cmd'};
    if ($cmd) {
        foreach my $home_var ('GSDL3SRCHOME', 'GSDLHOME', 'GSDL3HOME') {
            # An unset variable expands to the empty string.
            my $home_path = defined $ENV{$home_var} ? $ENV{$home_var} : '';
            $cmd =~ s/%%\Q$home_var\E/$home_path/g;
        }
        $self->{'exec_cmd'} = $cmd;
    }

    my $secondary_plugin_name    = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # Defaults for the secondary plugin, taken from RTFPlugin.
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self is already blessed into $class above; no second bless is needed.
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
163
# Hook invoked by UnknownPlugin::process().
# Deliberately a no-op in this plugin: neither the NoText flag (metadata) nor
# any dummy text is set, because - unlike plain UnknownPlugin - this plugin
# has a chance of extracting real text from the unknown file format.
sub add_dummy_text {
    my $self = $_[0];
}
170
# init, begin and deinit are defined explicitly even though they might be
# reached through inheritance anyway; per Dr Bainbridge it does no harm.
# The pattern is copied from PDFPlugin and PowerPointPlugin. With multiple
# inheritance, SUPER:: only follows the first parent in @ISA, so the
# ConvertBinaryFile versions are named explicitly. Background:
# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
#
# Forwards all arguments to ConvertBinaryFile::init and returns its result.
sub init {
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
183
# Forwards all arguments to ConvertBinaryFile::begin and returns its result.
sub begin {
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
190
# Forwards all arguments to ConvertBinaryFile::deinit and returns its result.
sub deinit {
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
197
# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual
# conversion: works out the expected output file (or directory), fills in the
# per-file placeholders of exec_cmd, runs the command and checks the result.
#
# Parameters:
#   $tmp_dirname   - directory the conversion works in
#   $tmp_filename  - input file handed to the command (%%INPUT_FILE)
#   $utf8_tailname - base name used for the paged-image output directory
#   $lc_suffix     - suffix of $tmp_filename, replaced by the output extension
#   $tailname, $suffix - only used in the progress message
#
# Returns the name of the generated output file, or "" on any failure (a
# message is then printed to the plugin's outhandle).
# Side effects: sets $self->{'converted_to'}; on success also sets
# $self->{'output_dirname'} and $self->{'output_filename'} and removes
# $tmp_filename.
sub run_conversion_command {
    my $self = shift(@_);
    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $verbosity      = $self->{'verbosity'};
    my $output_type    = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'};
    my $plugin_name    = $self->{'plugin_type'}; # inherited from BaseImporter

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $output_type format with extension $convert_to_ext\n";
    }

    # 1. Work out the output filename. The code comes from
    #    ConvertBinaryFile::tmp_area_convert_file(), slightly modified.
    my $is_paged_output = ($output_type =~ /item/i || $output_type =~ /^pagedimg/);

    # Record what we are converting to.
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    }
    elsif ($is_paged_output) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($is_paged_output) {
        # Paged output goes into its own sub-directory, using the
        # OS-specific path separator.
        my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
        if ($on_windows) {
            $output_dirname = $tmp_dirname . "\\$utf8_tailname\\";
        }
        else {
            $output_dirname = $tmp_dirname . "\/$utf8_tailname\/";
        }
        $output_filename = $output_dirname . $utf8_tailname . ".item";
    }
    else {
        # \Q...\E: the suffix is literal text, not a pattern (its leading
        # dot must not match an arbitrary character).
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    # 2. Execute the conversion command and check that it produced the
    #    expected output.
    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string for instance
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # Replace the placeholders that change per input file.
    # NOTE(review): the names are only wrapped in double quotes and the
    # command is run through the shell as a single string, so a file name
    # containing quotes or other shell metacharacters is not safe here.
    $cmd =~ s@%%INPUT_FILE@\"$tmp_filename\"@g; # refer to the softlink
    if (defined $output_dirname) {
        $cmd =~ s@%%OUTPUT@\"$output_dirname\"@g;
    }
    else {
        $cmd =~ s@%%OUTPUT@\"$output_filename\"@g;
    }

    # Replace any %%GSDL(3|3SRC)HOME placeholders still present.
    # NOTE(review): new() already expands these (unquoted) in exec_cmd, so
    # these lines normally match nothing; kept in case exec_cmd is changed
    # after construction.
    $cmd =~ s@%%GSDLHOME@\"$ENV{'GSDLHOME'}\"@g;
    $cmd =~ s@%%GSDL3HOME@\"$ENV{'GSDL3HOME'}\"@g;
    $cmd =~ s@%%GSDL3SRCHOME@\"$ENV{'GSDL3SRCHOME'}\"@g;

    # Some debugging
    if ($verbosity > 2) {
        print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
        print STDERR "   on infile |$tmp_filename|\n";
        print STDERR "   to produce expected $output_filename\n";
    }

    # Run the command at last. system() returns the wait status: -1 when the
    # program could not be started at all, otherwise the exit code is in the
    # high byte (status >> 8).
    my $status    = system($cmd);
    my $exit_code = ($status == -1) ? -1 : ($status >> 8);

    # 127 (Unix shell) and 9009 (Windows) mean "command not found". The raw
    # status is still compared as well, as the original code did.
    my %not_found_code = (127 => 1, 9009 => 1);
    if ($status == -1 || $not_found_code{$exit_code} || $not_found_code{$status}) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_code when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_code (wait status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    if (defined $output_dirname && !-d $output_dirname) {
        print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
        return "";
    }

    if ($output_type =~ /^pagedimg/) {
        # Multiple images were generated: the item file is produced here, so
        # its existence can only be checked after this call, not before.
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }
    elsif (!-f $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # Conversion success.
    $self->{'output_dirname'}  = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
332
333
# Reading is delegated to ConvertBinaryFile's read_into_doc_obj, which is the
# version that "calls the secondary plugin stuff". All arguments are forwarded
# and its result is returned.
sub read_into_doc_obj {
    my $self = shift;
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
339
# Processing is delegated to UnknownPlugin's process. All arguments are
# forwarded and its result is returned.
sub process {
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
344
345
3461;
Note: See TracBrowser for help on using the repository browser.