root/main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm @ 32026

Revision 32026, 12.6 KB (checked in by ak19, 2 years ago)

Some more placeholder strings for the UnknownConverterPlugin to recognise and replace appropriately

Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
49# Then a folder of html files is generated per document?
50# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
51
# Set up the inheritance chain at compile time. UnknownPlugin is listed
# first, so it is consulted first during method resolution.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
55
# Output formats that can be selected through the convert_to option.
# The 'desc' values are keys for display strings, not literal descriptions.
my $convert_to_list = [
    {
        name => 'text',
        desc => '{ConvertBinaryFile.convert_to.text}',
    },
    {
        name => 'html',
        desc => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        name => 'pagedimg_jpg',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}',
    },
    {
        name => 'pagedimg_gif',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}',
    },
    {
        name => 'pagedimg_png',
        desc => '{ConvertBinaryFile.convert_to.pagedimg_png}',
    },
];

# Arguments specific to this plugin: the command line to run and the
# format that command is expected to produce.
my $arguments = [
    {
        name => 'exec_cmd',
        desc => '{UnknownConverterPlugin.exec_cmd}',
        type => 'string',
        deft => '',
        reqd => 'yes',
    },
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
];

# Plugin description record that new() appends to the shared "OptList".
my $options = {
    name     => 'UnknownConverterPlugin',
    desc     => '{UnknownConverterPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
87
88
# Constructor. Builds the plugin from both superclasses, performs the one-time
# placeholder replacement in exec_cmd, and loads the secondary plugin that will
# process the converted output.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to both superclass constructors
#   $hashArgOptLists - hash ref; this plugin's arguments and options are appended
#                      to its "ArgList" and "OptList" entries
#
# Returns the plugin object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->new(...) rather than the ambiguous indirect object form "new Class(...)".
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    # Blessing once is sufficient: every method call below is made on this blessed object.
    $self = bless $self, $class;

    if(!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set in $convert_to_list declaration????
    }
    #print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n";

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    # one-time replacements in exec_cmd happen here
    # don't nest GSDL env paths in (escaped) double quotes, as they're likely to be part of larger paths
    # It's up to the user to do the (escaped) double-quoting around paths.
    my $cmd = $self->{'exec_cmd'};
    if (defined $cmd) {
        # Same replacement order as before: GSDL3SRCHOME, GSDLHOME, GSDL3HOME.
        foreach my $env_name ('GSDL3SRCHOME', 'GSDLHOME', 'GSDL3HOME') {
            my $env_value = $ENV{$env_name};
            # Leave the placeholder visible when the variable is not set, rather than
            # silently collapsing e.g. "%GSDL3SRCHOME/bin/x" into "/bin/x".
            next unless defined $env_value;
            $cmd =~ s/%$env_name/$env_value/g;
        }
        $self->{'exec_cmd'} = $cmd;
    }

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
154
155# Called by UnknownPlugin::process()
156# Overriding here to ensure that the NoText flag (metadata) and dummy text are not set,
157# since, unlike UnknownPlugin, this plugin has a chance of extracting text from the unknown file format
sub add_dummy_text {
    # Intentionally a no-op: nothing is added to the document here.
    # The invocant is handed back, which is what the bare assignment
    # in the earlier form evaluated to.
    my $self = shift;
    return $self;
}
161
162# Are init, begin and deinit necessary (will they not get called automatically)?
163# Dr Bainbridge says it doesn't hurt for these to be explicitly defined here.
164# Copied here from PDFPlugin, PowerPointPlugin
165# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
166# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
167# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
sub init {
    my $self = shift;

    # With two superclasses, name the one whose init must run
    # and forward the remaining arguments untouched.
    return $self->ConvertBinaryFile::init(@_);
}
174
sub begin {
    my $self = shift;

    # Explicitly run ConvertBinaryFile's begin with the remaining arguments.
    return $self->ConvertBinaryFile::begin(@_);
}
181
sub deinit {
    my $self = shift;

    # Explicitly run ConvertBinaryFile's deinit with the remaining arguments.
    return $self->ConvertBinaryFile::deinit(@_);
}
188
# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion.
# Works out the output file (or directory) name for the configured convert_to type,
# substitutes the per-file placeholders %INPUT_FILE and %OUTPUT into exec_cmd, runs
# the command, and checks that the expected output was produced.
#
# Parameters:
#   $tmp_dirname   - directory in which an output folder is created for item/pagedimg output
#   $tmp_filename  - input file handed to the command (replaces %INPUT_FILE); it is
#                    removed once the command has succeeded
#   $utf8_tailname - used to name the output folder and the .item file
#   $lc_suffix     - suffix of $tmp_filename that gets replaced by ".<convert_to>" for
#                    single-file output (presumably the lowercased extension including
#                    the dot - confirm against the caller)
#   $tailname, $suffix - only used in the progress message
#
# Returns the output filename on success, or "" on any failure (the reason is printed
# to the plugin's outhandle).
# Side effects: sets $self->{'converted_to'}, and on success $self->{'output_dirname'}
# and $self->{'output_filename'}.
sub run_conversion_command {
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $verbosity      = $self->{'verbosity'};
    my $output_type    = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'};
    my $plugin_name    = $self->{'plugin_type'}; # inherited from BaseImporter

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $output_type format with extension $convert_to_ext\n";
    }

    # Item/paged-image conversions produce a folder of files rather than a single file.
    my $output_is_dir = ($output_type =~ /item/i || $output_type =~ /^pagedimg/) ? 1 : 0;

    # store the *actual* output type
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_is_dir) {
        $self->{'converted_to'} = "PagedImage";
    }

    # 1. Work out the output filename. Code for it comes from
    # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified
    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($output_is_dir) {
        my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
        my $dirsep = $on_windows ? "\\" : "/";
        $output_dirname  = $tmp_dirname . $dirsep . $utf8_tailname . $dirsep;
        $output_filename = $output_dirname . $utf8_tailname . ".item";
    } else {
        # \Q..\E: the suffix is literal text, so its "." must not act as a regex wildcard
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    # 2. Execute the conversion command and check its result
    my $cmd = $self->{'exec_cmd'};
    if(!$cmd) { # empty string for instance
        $cmd = "" unless defined $cmd;
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # replace remaining occurrences of placeholders in cmd string, the ones that change per input file
    $cmd =~ s@%INPUT_FILE@\"$tmp_filename\"@g; # refer to the softlink
    if(defined $output_dirname) {
        $cmd =~ s@%OUTPUT@\"$output_dirname\"@g;
    } else {
        $cmd =~ s@%OUTPUT@\"$output_filename\"@g;
    }

    # Some debugging
    if ($verbosity > 2) {
        print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
        print STDERR "   on infile |$tmp_filename|\n";
        print STDERR "   to produce expected $output_filename\n";
    }

    # Run the command at last.
    # system() returns the wait status: -1 if the command could not be started at all,
    # otherwise the program's exit value is in the high byte (status >> 8).
    my $status = system($cmd);

    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: cmd could not be launched ($!) when running $cmd\n";
        return "";
    }

    my $exit_value = $status >> 8;

    # 127 (Unix shell) and 9009 (Windows) mean the cmd isn't recognised. The raw status is
    # checked as well, in case a platform hands the exit value back unshifted.
    if (grep { $_ == 127 || $_ == 9009 } ($exit_value, $status)) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got exit value $exit_value when running $cmd)\n";
        return "";
    }

    if($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value (wait status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    if (defined $output_dirname) {
        # Directory output: only the folder has to exist at this point, because the
        # .item file named by $output_filename is created by generate_item_file() below.
        if (! -d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (! -f $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($output_type =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
317
318
319# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
sub read_into_doc_obj {
    my $self = shift;

    # Pick ConvertBinaryFile's implementation explicitly and pass
    # all remaining arguments straight through.
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
324
sub process {
    my $self = shift;

    # Pick UnknownPlugin's implementation explicitly and pass
    # all remaining arguments straight through.
    return $self->UnknownPlugin::process(@_);
}
329
330
3311;
Note: See TracBrowser for help on using the browser.