source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm@ 32028

Last change on this file since 32028 was 32028, checked in by ak19, 7 years ago
  1. Bugfix to previous commit: var might not be on an UnknownConverterPlugin, so the regexes on shouldn't take place then. 2. Shifted to using Kathy's double percent signs denoting placeholders, tested and works still.
File size: 12.6 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
49# Then a folder of html files is generated per document?
50# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
51
# Establish the two parent classes at compile time. UnknownPlugin is listed
# first, so it is searched before ConvertBinaryFile during method lookup.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
55
# Output formats offered by the convert_to option.
# The 'desc' values are written as {Key} placeholders -- presumably resolved
# against the display-strings resource by the caller; confirm against the
# option-printing code.
my $convert_to_list = [
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}",
    },
    {
        'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}",
    },
    {
        'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}",
    },
    {
        'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}",
    },
    {
        'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}",
    },
];

# Plugin-specific options:
#   exec_cmd   - the external conversion command line (required, no default)
#   convert_to - one of the formats in $convert_to_list (required, defaults to "text")
my $arguments = [
    {
        'name' => "exec_cmd",
        'desc' => "{UnknownConverterPlugin.exec_cmd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "yes",
    },
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
];

# Top-level description of this plugin; pushed onto the caller's "OptList"
# by the constructor.
my $options = {
    'name'     => "UnknownConverterPlugin",
    'desc'     => "{UnknownConverterPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
87
88
sub new {
    # Constructor.
    #
    # Parameters:
    #   $class           - class name being constructed
    #   $pluginlist      - array ref; this class name is appended to it
    #   $inputargs       - plugin arguments, handed unchanged to both parent constructors
    #   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
    #                      plugin's argument and option declarations
    #
    # Returns the blessed plugin object, after its secondary plugins have been loaded.
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct Class->new() calls rather than the ambiguous indirect-object form.
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    $self = bless $self, $class;

    # convert_to can still be undefined at this point despite the declared
    # default, so fall back to "text" explicitly.
    if (!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    # One-time replacements in exec_cmd happen here.
    # Don't nest GSDL env paths in (escaped) double quotes, as they're likely to be part of
    # larger paths. It's up to the user to do the (escaped) double-quoting around paths.
    my $cmd = $self->{'exec_cmd'};
    if ($cmd) {
        foreach my $env_name ('GSDL3SRCHOME', 'GSDLHOME', 'GSDL3HOME') {
            # Leave the placeholder untouched when the variable is not set:
            # substituting an empty string would silently turn
            # "%%GSDL3HOME/bin/x" into "/bin/x".
            next unless defined $ENV{$env_name};
            $cmd =~ s/%%$env_name/$ENV{$env_name}/g;
        }
        $self->{'exec_cmd'} = $cmd;
    }

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
156
157# Called by UnknownPlugin::process()
158# Overriding here to ensure that the NoText flag (metadata) and dummy text are not set,
159# since, unlike UnknownPlugin, this plugin has a chance of extracting text from the unknown file format
sub add_dummy_text {
    # Deliberate no-op: no dummy text and no NoText flag are added here.
    # The only value produced is the invocant itself, which is handed back
    # unchanged; callers are not expected to rely on it.
    my ($plugin) = @_;
    return $plugin;
}
163
164# Are init, begin and deinit necessary (will they not get called automatically)?
165# Dr Bainbridge says it doesn't hurt for these to be explicitly defined here.
166# Copied here from PDFPlugin, PowerPointPlugin
167# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
168# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
169# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
sub init {
    # Delegates to ConvertBinaryFile's init by fully-qualified name, so the
    # call does not depend on which parent is found first in @ISA.
    # All remaining arguments are forwarded as-is; the delegate's result is returned.
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
176
sub begin {
    # Delegates to ConvertBinaryFile's begin by fully-qualified name, passing
    # every remaining argument through and returning the delegate's result.
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
183
sub deinit {
    # Delegates to ConvertBinaryFile's deinit by fully-qualified name, passing
    # every remaining argument through and returning the delegate's result.
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
190
191# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion
192# In order to call the custom conversion command, UnknownConverterPlugin needs to know the actual
193# input filename (which is the tmp_filename parameter) and the output file name, which this subroutine
194# will work out. Then it will run the conversion command.
sub run_conversion_command {
    # Runs the user-supplied exec_cmd on one input file and reports where the
    # converted output ended up.
    #
    # Parameters:
    #   $tmp_dirname   - directory holding the temporary copy/link of the input
    #   $tmp_filename  - full path of the input file to convert
    #   $utf8_tailname - input file name without suffix; used to name the
    #                    output folder and .item file for paged-image output
    #   $lc_suffix     - suffix of $tmp_filename that gets swapped for the
    #                    output extension
    #   $tailname, $suffix - only used in the progress message
    #
    # Returns the output file name on success, or "" on any failure (an
    # explanation is printed to the plugin's outhandle). On success it also
    # records 'output_dirname' and 'output_filename' on $self; 'converted_to'
    # is set as soon as the output type is recognised.
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $convert_to     = $self->{'convert_to'};
    my $verbosity      = $self->{'verbosity'};
    my $convert_to_ext = $self->{'convert_to_ext'};
    my $plugin_name    = $self->{'plugin_type'}; # inherited from BaseImporter

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    }

    # The command to be executed must be provided the input filename and output file/dir name
    # input filename = tmp_filename
    # 1. We now work out the output filename. Code for it comes from
    # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified
    my $output_type = $convert_to;
    my $wants_output_dir = ($output_type =~ /item/i || $output_type =~ /^pagedimg/) ? 1 : 0;

    # Record the kind of output this conversion is expected to yield.
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($wants_output_dir) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($wants_output_dir) {
        # Paged-image output goes into its own folder named after the document.
        my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
        my $dirsep = $on_windows ? "\\" : "/";
        $output_dirname  = $tmp_dirname . $dirsep . $utf8_tailname . $dirsep;
        $output_filename = $output_dirname . $utf8_tailname . ".item";
    } else {
        # \Q...\E: the suffix is literal text, not a pattern. Unescaped, a
        # suffix such as ".c++" would fail to match and leave the output name
        # identical to the input name.
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    # 2. Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string for instance
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # Replace the per-file placeholders in a single pass, so that text
    # inserted for one placeholder is never re-scanned for the other.
    # NOTE(review): the paths are only wrapped in double quotes; characters
    # special to the shell inside a file name (", $, `) are not escaped.
    my %placeholder_value = (
        'INPUT_FILE' => $tmp_filename, # refer to the softlink
        'OUTPUT'     => (defined $output_dirname ? $output_dirname : $output_filename),
    );
    $cmd =~ s/%%(INPUT_FILE|OUTPUT)/"$placeholder_value{$1}"/g;

    # Some debugging
    if ($verbosity > 2) {
        print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
        print STDERR " on infile |$tmp_filename|\n";
        print STDERR " to produce expected $output_filename\n";
    }

    # Run the command at last
    my $status = system($cmd);

    if ($status != 0) {
        # system() returns the wait status: the program's exit code is in the
        # high byte, and -1 means the command could not be started at all.
        my $exit_value = ($status == -1) ? -1 : ($status >> 8);

        # 127 (Unix shell) / 9009 (Windows) mean the command isn't recognised.
        # The undecoded status is still compared as well, to keep catching
        # anything the previous check caught.
        if ($exit_value == 127 || $exit_value == 9009 || $status == 127 || $status == 9009) {
            print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got exit value $exit_value, wait status $status, when running $cmd)\n";
            return "";
        }

        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value (wait status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    if (defined $output_dirname) {
        if (! -d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }

        if ($convert_to =~ /^pagedimg/) {
            # Multiple images were generated by the conversion. The .item file
            # listing them is only requested from generate_item_file() here,
            # so it must not be required to exist before this point.
            my $item_filename = $self->generate_item_file($output_filename);

            if (!-e $item_filename) {
                print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
                return "";
            }
            $output_filename = $item_filename;
        }
        elsif (! -f $output_filename) {
            print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
            return "";
        }
    }
    elsif (! -f $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # conversion success
    $self->{'output_dirname'}  = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
319
320
321# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
sub read_into_doc_obj {
    # Explicitly selects ConvertBinaryFile's read_into_doc_obj, forwarding all
    # remaining arguments and returning whatever the delegate returns.
    my $self = shift;
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
326
sub process {
    # Explicitly selects UnknownPlugin's process, forwarding all remaining
    # arguments and returning whatever the delegate returns.
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
331
332
3331;
Note: See TracBrowser for help on using the repository browser.