source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm@ 31888

Last change on this file since 31888 was 31766, checked in by ak19, 7 years ago
  1. Refactored ConvertBinaryFile:tmp_area_convert_file() to do the conversion command (final portion of the function) in a separate subroutine, the new run_conversion_command(), so that the new subclass UnknownConverterPlugin can override this method. 2. UnknownConverterPlugin currently does the unique portions of its previously overridden tmp_area_convert_file() in the new run_conversion_command() that it now overrides, since it inherits all of tmp_area_convert_file(). 3. Removed currently unused overridden read() method. Also removed some other unwanted things.
File size: 12.1 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
49# Then a folder of html files is generated per document?
50# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
51
# Set up multiple inheritance at compile time. UnknownPlugin is listed first,
# so it takes precedence over ConvertBinaryFile during method resolution.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
55
# Permitted values for the convert_to option. Each entry pairs the option
# value with the key of its description string, which always has the form
# {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => "{ConvertBinaryFile.convert_to." . $_ . "}" } }
        qw(text html pagedimg_jpg pagedimg_gif pagedimg_png)
];
68
# Plugin-specific arguments, added to the inherited ones:
#   exec_cmd   - the external conversion command line (required string)
#   convert_to - the format the external command produces (required enum,
#                defaulting to "text"; choices come from $convert_to_list)
my $arguments = [
    {
        'name' => 'exec_cmd',
        'desc' => '{UnknownConverterPlugin.exec_cmd}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'yes',
    },
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'text',
    },
];
81
# Option record describing this plugin; new() pushes it onto the shared
# OptList alongside the argument list above.
my $options = {
    'name'     => 'UnknownConverterPlugin',
    'desc'     => '{UnknownConverterPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
87
88
# Constructor.
#
# Registers this plugin's arguments and options, constructs both superclass
# objects and merges them into a single object, then configures and loads the
# secondary plugin that will process the converted output.
#
# Parameters: ($pluginlist, $inputargs, $hashArgOptLists), all passed on
# unchanged to the superclass constructors.
# Returns the blessed plugin object.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The superclass objects must be handed to merge_inheritance() in the same
    # order in which the superclasses are listed in @ISA.
    my $unknown_plugin_part = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $converter_part      = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = bless BaseImporter::merge_inheritance($unknown_plugin_part, $converter_part), $class;

    # convert_to declares a default of "text" in the argument list, yet it can
    # still arrive undefined here, so fall back explicitly.
    $self->{'convert_to'} = "text" unless defined $self->{'convert_to'};

    # Sets convert_to_plugin and convert_to_ext, which decide the secondary
    # plugin used to process the generated text/html/images.
    $self->set_standard_convert_settings();

    my $secondary_plugin_name    = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    $secondary_plugin_options->{$secondary_plugin_name} = []
        unless defined $secondary_plugin_options->{$secondary_plugin_name};
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # Defaults for the secondary plugins, taken from RTFPlugin.
    my @extra_options = ("-file_rename_method", "none");
    push @extra_options, "-extract_language" if $self->{'extract_language'};

    if ($secondary_plugin_name eq "TextPlugin") {
        push @extra_options, "-input_encoding", "utf8";
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push @extra_options, "-description_tags" if $self->{'description_tags'};
        push @extra_options, "-processing_tmp_files";
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push @extra_options, "-screenviewsize", "1000";
        push @extra_options, "-enable_cache";
        push @extra_options, "-processing_tmp_files";
    }
    push @$specific_options, @extra_options;

    # Bless again, as PDFPlugin and PowerPointPlugin do.
    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
145
# Called by UnknownPlugin::process().
# Deliberately a no-op override: unlike UnknownPlugin, this plugin has a chance
# of extracting real text from the unknown file format, so neither the NoText
# flag (metadata) nor any dummy text should be set on the document.
# Hands back the plugin object itself, which is also what falls out of the
# bare "my $self = shift" form of this sub.
sub add_dummy_text {
    my ($plugin) = @_;
    return $plugin;
}
152
# init, begin and deinit are defined explicitly (as in PDFPlugin and
# PowerPointPlugin) rather than relying on inheritance. With multiple
# inheritance, method resolution follows @ISA order, so an inherited call would
# resolve through UnknownPlugin first; these wrappers make sure the
# ConvertBinaryFile versions are the ones that run.
# Dr Bainbridge says it doesn't hurt for these to be explicitly defined here.
# Background reading:
# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f

# Delegates initialisation to ConvertBinaryFile::init(), passing all arguments
# through unchanged and returning whatever it returns.
sub init {
    my $plugin = shift @_;

    return $plugin->ConvertBinaryFile::init(@_);
}
165
# Delegates to ConvertBinaryFile::begin(), passing all arguments through
# unchanged and returning whatever it returns.
sub begin {
    my $plugin = shift @_;

    return $plugin->ConvertBinaryFile::begin(@_);
}
172
# Delegates to ConvertBinaryFile::deinit(), passing all arguments through
# unchanged and returning whatever it returns.
sub deinit {
    my $plugin = shift @_;

    return $plugin->ConvertBinaryFile::deinit(@_);
}
179
# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion.
# In order to call the custom conversion command, UnknownConverterPlugin needs to
# know the actual input filename (which is the tmp_filename parameter) and the
# output file name, which this subroutine works out. Then it runs the command.
#
# Parameters:
#   $tmp_dirname   - directory containing the temporary copy/link of the input
#   $tmp_filename  - full path of the temporary input file
#   $utf8_tailname - input file name without its suffix (used to name output)
#   $lc_suffix     - lower-cased suffix of the input file, treated literally
#   $tailname, $suffix - original tail name and suffix (progress message only)
#
# Placeholders in the configured exec_cmd:
#   %INPUT_FILE - replaced by the double-quoted temporary input file name
#   %OUTPUT     - replaced by the double-quoted output directory (paged-image
#                 conversions) or output file name (all other conversions)
#
# Side effects: sets $self->{'converted_to'}; on success also sets
# $self->{'output_dirname'} and $self->{'output_filename'}, and removes the
# temporary input file.
#
# Returns the name of the generated output file (the item file for paged-image
# conversions), or the empty string if the conversion failed.
sub run_conversion_command {
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $verbosity      = $self->{'verbosity'};
    my $output_type    = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'};
    my $plugin_name    = $self->{'plugin_type'}; # inherited from BaseImporter

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $output_type format with extension $convert_to_ext\n";
    }

    # 1. Record the output type and work out the output filename. The code
    # comes from ConvertBinaryFile::tmp_area_convert_file(), slightly modified.
    my $is_paged_output = ($output_type =~ /item/i || $output_type =~ /^pagedimg/) ? 1 : 0;

    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($is_paged_output) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($is_paged_output) {
        # Paged-image output goes into its own directory named after the input.
        my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
        my $dirsep     = $on_windows ? "\\" : "/";
        $output_dirname  = $tmp_dirname . $dirsep . $utf8_tailname . $dirsep;
        $output_filename = $output_dirname . $utf8_tailname . ".item";
    } else {
        # \Q..\E: the suffix is literal text, so its "." must not act as a
        # regex wildcard.
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    # 2. Build and execute the conversion command.
    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # undefined or empty string, for instance
        $cmd = "" unless defined $cmd;
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # Replace the placeholders in a single pass, so that a placeholder-like
    # sequence inside an inserted file name is never expanded a second time.
    my %value_for = (
        'INPUT_FILE' => $tmp_filename, # refers to the softlink
        'OUTPUT'     => (defined $output_dirname ? $output_dirname : $output_filename),
    );
    $cmd =~ s/%(INPUT_FILE|OUTPUT)/'"' . $value_for{$1} . '"'/ge;

    # Some debugging
    if ($verbosity > 2) {
        print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
        print STDERR "   on infile |$tmp_filename|\n";
        print STDERR "   to produce expected $output_filename\n";
    }

    # Run the command at last. system() returns the raw wait status: -1 if the
    # command could not be started at all, otherwise the exit code is held in
    # the high byte (status >> 8).
    my $status = system($cmd);

    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $status when running $cmd)\n";
        return "";
    }

    my $exit_value = $status >> 8;

    # 127 (Unix shell) and 9009 (Windows) mean the command was not recognised.
    # The raw status is also compared so that any platform reporting these
    # values unshifted is still caught, as before.
    if (grep { $_ == 127 || $_ == 9009 } ($exit_value, $status)) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value (raw status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    FileUtils::removeFiles($tmp_filename);

    # Check that the expected output exists. For paged-image conversions only
    # the directory can be checked at this point, because the item file inside
    # it is generated further below.
    if (defined $output_dirname) {
        if (! -d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (! -f $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($output_type =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'}  = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
308
309
# Use the read_into_doc_obj from ConvertBinaryFile "to call secondary plugin
# stuff". The explicit qualification is needed because UnknownPlugin comes
# first in @ISA. All arguments are passed through unchanged and the result of
# the ConvertBinaryFile method is returned.
sub read_into_doc_obj {
    my $plugin = shift @_;

    return $plugin->ConvertBinaryFile::read_into_doc_obj(@_);
}
315
# Delegates document processing to UnknownPlugin::process(), passing all
# arguments through unchanged and returning whatever it returns.
sub process {
    my $plugin = shift @_;

    return $plugin->UnknownPlugin::process(@_);
}
320
321
3221;
Note: See TracBrowser for help on using the repository browser.