source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm@ 32026

Last change on this file since 32026 was 32026, checked in by ak19, 7 years ago

Some more placeholder strings for the UnknownConverterPlugin to recognise and replace appropriately

File size: 12.6 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom, unknown external conversion application that will
5# convert from some custom unknown format to one of text, html or paged images.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
49# Then a folder of html files is generated per document?
50# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
51
# Declare the two parent classes at compile time. The order of this list is
# significant: new() passes the superclass objects to merge_inheritance() in
# the same order.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
55
# Output formats the external converter may be asked to produce. The 'desc'
# values are placeholder keys (resolved elsewhere), not literal descriptions.
my $convert_to_list = [
    {
        name => 'text',
        desc => '{ConvertBinaryFile.convert_to.text}',
    },
    {
        name => 'html',
        desc => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        name => 'pagedimg_jpg',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}',
    },
    {
        name => 'pagedimg_gif',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}',
    },
    {
        name => 'pagedimg_png',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_png}',
    },
];

# Plugin-specific arguments:
#   exec_cmd   - the command line to run (mandatory, no useful default)
#   convert_to - which of the formats above the command produces
my $arguments = [
    {
        name => 'exec_cmd',
        desc => '{UnknownConverterPlugin.exec_cmd}',
        type => 'string',
        deft => '',
        reqd => 'yes',
    },
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
];

# Option record that new() appends to the shared "OptList".
my $options = {
    name     => 'UnknownConverterPlugin',
    desc     => '{UnknownConverterPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
87
88
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - plugin arguments, passed through to both parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive this
#                      plugin's argument and option declarations
#
# Returns the new plugin object, with the GSDL path placeholders in exec_cmd
# already expanded and the secondary plugins (for processing the converted
# output) loaded.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Construct both parents explicitly (Class->new rather than the
    # ambiguous indirect-object form "new Class(...)").
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    $self = bless $self, $class;

    # convert_to declares a default of "text", yet it can still arrive here
    # undefined, so fall back to that default by hand.
    if (!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    # One-time replacements in exec_cmd happen here.
    # Don't nest GSDL env paths in (escaped) double quotes, as they're likely to be part of
    # larger paths. It's up to the user to do the (escaped) double-quoting around paths.
    # An unset environment variable expands to the empty string.
    my $cmd = $self->{'exec_cmd'};
    if (defined $cmd) {
        foreach my $env_name ('GSDL3SRCHOME', 'GSDLHOME', 'GSDL3HOME') {
            my $env_value = defined $ENV{$env_name} ? $ENV{$env_name} : '';
            $cmd =~ s@%\Q$env_name\E@$env_value@g;
        }
    }
    $self->{'exec_cmd'} = $cmd;

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # bless again, copied from PDFPlugin, PowerPointPlugin
    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
154
# Hook invoked from UnknownPlugin::process().
# Deliberately a no-op: UnknownPlugin would mark the document with the NoText
# flag (metadata) and insert dummy text, but this plugin has a chance of
# extracting real text from the unknown file format, so neither must be set.
# Any further arguments are ignored; the invocant is returned unchanged.
sub add_dummy_text {
    my ($self) = @_;
    return $self;
}
161
# init, begin and deinit are spelled out explicitly (as in PDFPlugin and
# PowerPointPlugin) so that the ConvertBinaryFile versions are the ones that
# run. With multiple inheritance, plain method resolution would pick whichever
# parent comes first in @ISA; see
# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
#
# init: forwards all arguments to ConvertBinaryFile::init and returns its result.
sub init {
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
174
# begin: forwards all arguments to ConvertBinaryFile::begin and returns its
# result, so that ConvertBinaryFile's version is the one that runs.
sub begin {
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
181
# deinit: forwards all arguments to ConvertBinaryFile::deinit and returns its
# result, so that ConvertBinaryFile's version is the one that runs.
sub deinit {
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
188
# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion.
# In order to call the custom conversion command, UnknownConverterPlugin needs to know the
# actual input filename (the tmp_filename parameter) and the output file name, which this
# subroutine works out. Then it runs the conversion command.
#
# Parameters:
#   $tmp_dirname   - directory in which the conversion takes place
#   $tmp_filename  - input file handed to the command (the soft link to the original)
#   $utf8_tailname - used to name the output directory and .item file for paged output
#   $lc_suffix     - suffix of $tmp_filename that gets replaced by the output type
#   $tailname, $suffix - only used in the progress message
#
# Returns the name of the generated output file (the .item file for pagedimg_*
# output), or "" if the command is missing, fails, or does not produce the
# expected output. Errors are reported on the plugin's outhandle.
# On success it also records converted_to, output_dirname and output_filename
# on $self and removes $tmp_filename.
sub run_conversion_command {
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $verbosity      = $self->{'verbosity'};
    my $output_type    = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'};
    my $plugin_name    = $self->{'plugin_type'}; # inherited from BaseImporter

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $output_type format with extension $convert_to_ext\n";
    }

    # The command to be executed must be provided the input filename and output file/dir name
    # input filename = tmp_filename
    # 1. We now work out the output filename. Code for it comes from
    # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified

    # store the output type that was asked for
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        # Paged output goes into its own directory, described by an .item file.
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_dirname = $tmp_dirname . "\\$utf8_tailname\\";
        } else {
            $output_dirname = $tmp_dirname . "\/$utf8_tailname\/";
        }
        $output_filename = $output_dirname . $utf8_tailname . ".item";
    } else {
        # \Q..\E: the suffix is literal text (its "." must not match any character)
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    # 2. Execute the conversion command and check that it produced the expected output.

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string for instance
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # replace remaining occurrences of placeholders in cmd string, the ones that change per input file
    # NOTE(review): the command is run through the shell as one string and the file names are
    # only wrapped in double quotes; names containing quotes or shell metacharacters would
    # not be safe -- confirm that tmp file names are sanitised upstream.
    $cmd =~ s@%INPUT_FILE@\"$tmp_filename\"@g; # refer to the softlink
    if (defined $output_dirname) {
        $cmd =~ s@%OUTPUT@\"$output_dirname\"@g;
    } else {
        $cmd =~ s@%OUTPUT@\"$output_filename\"@g;
    }

    # Some debugging
    if ($verbosity > 2) {
        print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
        print STDERR "   on infile |$tmp_filename|\n";
        print STDERR "   to produce expected $output_filename\n";
    }

    # Run the command at last
    my $status = system($cmd);

    # system() returns -1 if the command could not be started at all
    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: cmd could not be launched ($!) when running $cmd\n";
        return "";
    }

    # Otherwise system() returns the wait status: the program's exit value is
    # in the high byte(s), the terminating signal (if any) in the low 7 bits.
    my $exit_value = $status >> 8;
    my $signal     = $status & 127;

    # On Linux the shell exits with 127 if the program isn't installed;
    # on Windows the corresponding ERRORLEVEL is 9009.
    if ($exit_value == 127 || $exit_value == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        my $signal_note = $signal ? " (terminated by signal $signal)" : "";
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value$signal_note\n";
        return "";
    }

    # remove symbolic link to original file
    FileUtils::removeFiles($tmp_filename);

    # For pagedimg_* output the .item file is only written by generate_item_file()
    # below, so it cannot be required to exist yet; only the directory is checked.
    my $item_file_comes_later = ($output_type =~ /^pagedimg/) ? 1 : 0;

    if (defined $output_dirname && ! -d $output_dirname) {
        print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
        return "";
    }
    elsif (!$item_file_comes_later && ! -f $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($item_file_comes_later) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
317
318
# Explicitly select ConvertBinaryFile's read_into_doc_obj (rather than whatever
# plain method resolution would find) "to call secondary plugin stuff".
# All arguments are forwarded unchanged and its result is returned.
sub read_into_doc_obj {
    my $self = shift;
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
324
# Explicitly select UnknownPlugin's process. All arguments are forwarded
# unchanged and its result is returned.
sub process {
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
329
330
3311;
Note: See TracBrowser for help on using the repository browser.