root/main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak @ 31745

Revision 31745, 14.3 KB (checked in by ak19, 3 years ago)

Another change that's needed, this time to add the plugin.

Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom, unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47
# Set up multiple inheritance at compile time. UnknownPlugin is listed
# first, so it is searched before ConvertBinaryFile during method lookup.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
51
# Values offered for the convert_to option. Each entry pairs the option
# value with the key of its description string.
my $convert_to_list = [
    { 'name' => "text",
      'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "html",
      'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "pagedimg_jpg",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}" },
    { 'name' => "pagedimg_gif",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}" },
    { 'name' => "pagedimg_png",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}" },
];

# Options this plugin adds on top of those of its parents:
#   exec_cmd                - the conversion command line to run
#   convert_to              - the format the command produces
#   output_file_or_dir_name - where the command leaves its output
my $arguments = [
    { 'name' => "exec_cmd",
      'desc' => "{UnknownConverterPlugin.exec_cmd}",
      'type' => "string",
      'deft' => "",
      'reqd' => "yes" },
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $convert_to_list,
      'deft' => "text" },
    { 'name' => "output_file_or_dir_name",
      'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}",
      'type' => "string",
      'reqd' => "yes",
      'deft' => "" },
];

# Plugin description record that new() pushes onto the shared OptList.
my $options = {
    'name'     => "UnknownConverterPlugin",
    'desc'     => "{UnknownConverterPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
88
89
# Constructor.
#
# Parameters:
#   $class            - class name being constructed
#   $pluginlist       - array ref; $class is appended to it
#   $inputargs        - passed through to both parent constructors
#   $hashArgOptLists  - hash ref; this plugin's arguments and options are
#                       appended to its "ArgList" and "OptList" entries
#
# Builds one object from each parent class, merges them with
# BaseImporter::merge_inheritance, configures the options for the secondary
# plugin selected by convert_to, and loads the secondary plugins.
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Method-call syntax instead of the ambiguous indirect-object form
    # "new ConvertBinaryFile(...)".
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($cbf_self, $unknown_converter_self);

    $self = bless $self, $class;

    # Only fall back to "text" when no convert_to value is present.
    # Assigning it unconditionally would discard an html or pagedimg_*
    # choice already stored on the merged object.
    if (!defined $self->{'convert_to'} || $self->{'convert_to'} eq "") {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # bless again, copied from PDFPlugin, PowerPointPlugin
    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
139
140# Are init, begin and deinit necessary (will they not get called automatically)?
141# Copied here from PDFPlugin, PowerPointPlugin
142# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
143# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
144# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
# Forward init, with all remaining arguments, to ConvertBinaryFile's
# implementation by naming that parent class explicitly.
sub init {
    my ($self, @args) = @_;

    return $self->ConvertBinaryFile::init(@args);
}
151
# Forward begin, with all remaining arguments, to ConvertBinaryFile's
# implementation by naming that parent class explicitly.
sub begin {
    my ($self, @args) = @_;

    return $self->ConvertBinaryFile::begin(@args);
}
158
# Forward deinit, with all remaining arguments, to ConvertBinaryFile's
# implementation by naming that parent class explicitly.
sub deinit {
    my ($self, @args) = @_;

    return $self->ConvertBinaryFile::deinit(@args);
}
165
166# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
# Runs the user-supplied conversion command (the exec_cmd option).
#
# Parameters:
#   $output_ext      - requested output extension (not used here)
#   $input_filename  - path of the file being converted
#   $textref         - not used here
#
# Returns "" on any failure (after writing a message to the plugin's
# outhandle). On success returns the configured output_file_or_dir_name,
# or, for the pagedimg_* conversions, the name returned by
# generate_item_file().
sub tmp_area_convert_file {
    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter
    my $outhandle = $self->{'outhandle'};

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # undefined or empty string for instance
        $cmd = "" unless defined $cmd;
        print $outhandle "$plugin_name Conversion error: invalid cmd $cmd\n";
        return "";
    }

    # Replace every '*' placeholder in the command with the input file's
    # name stripped of its directory and extension.
    my ($tailname, $dir, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    $cmd =~ s/\*/$tailname/g;
    print STDERR "@@@@ $plugin_name: executing conversion cmd $cmd\n";
    my $status = system($cmd);

    # system() returns -1 when the command could not be started at all.
    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: could not run cmd $cmd: $!\n";
        return "";
    }

    # system() returns the wait status; the program's exit value is in the
    # high byte. 127 (Unix shell) and 9009 (Windows) mean the command was
    # not found. The raw-status comparisons are kept for compatibility with
    # the previous check.
    my $exit_value = $status >> 8;
    if ($exit_value == 127 || $exit_value == 9009 || $status == 127 || $status == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value\n";
        return "";
    }

    my $output_file_or_dir = $self->{'output_file_or_dir_name'};
    if (!defined $output_file_or_dir || !-e $output_file_or_dir) {
        $output_file_or_dir = "" unless defined $output_file_or_dir;
        print $outhandle "$plugin_name Conversion error: Output file/dir $output_file_or_dir doesn't exist\n";
        return "";
    }

    # else, conversion success

    # If multiple images were generated by running the conversion. The
    # convert_to values are pagedimg_jpg, pagedimg_gif and pagedimg_png,
    # so match on the prefix rather than the bare word "pagedimg".
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_file_or_dir);
        return $item_filename;
    }

    return $output_file_or_dir;
}
220
221# Copied from PowerPointPlugin, with some modifications
222# override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
# Reads one source file.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# When the configured output location is not a directory, or convert_to is
# not "html", this defers to BaseImporter::read. Otherwise it runs the
# conversion and feeds every *.html file found in the output directory to
# the HTMLPlugin secondary plugin, one document per file.
#
# Returns undef if this plugin cannot process the file, -1 if conversion
# failed, the secondary plugin's return value if an HTML file was not
# processed, and 1 on success.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    my $output_file_or_dir = $self->{'output_file_or_dir_name'};
    my $is_output_dir = (-d $output_file_or_dir) ? 1 : 0;

    # we are only doing something special if we have a directory of html files
    if (!$is_output_dir || $self->{'convert_to'} ne "html") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }
    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($conv_tailname, $html_dirname, $conv_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");
    # When the conversion result is itself a directory, the html files are
    # inside it; fileparse() would otherwise hand back its parent directory.
    if (-d $conv_filename) {
        $html_dirname = $conv_filename;
    }

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    if (opendir (my $dir_handle, $html_dirname)) {
        @dir = readdir ($dir_handle);
        closedir ($dir_handle);
    } else {
        print $outhandle "$self->{'plugin_type'}: Couldn't read directory $html_dirname\n";
        # just process the original file; the suffix returned by
        # fileparse() already includes its leading dot
        @dir = ("$conv_tailname$conv_suffix");
    }

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo, "", &util::filename_cat($html_dirname, $html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin

        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file, $metadata, $plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

        # FilenameRoot is the source file's name without directory or extension.
        my ($source_tailname)
            = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj, $topsection, $filename_no_path);

        # use the one generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # deleted some commented out code here that exists in PowerPointPlugin

    # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created
    # as we don't know where it was created
    #$self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
331
332# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
# Delegates to ConvertBinaryFile's read_into_doc_obj, passing all remaining
# arguments through and returning whatever it returns. The previous body
# called ConvertBinaryFile::deinit here, so the file was never read.
sub read_into_doc_obj {
    my $self = shift (@_);
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
337
# Forward process, with all remaining arguments, to UnknownPlugin's
# implementation by naming that parent class explicitly.
sub process {
    my ($self, @args) = @_;

    return $self->UnknownPlugin::process(@args);
}
342
343# do we also need a html_multi option to convert_to?
344# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
# Orders two directory entries by their leading non-digit text and then,
# when that text is equal, numerically by the digits that follow it
# (so "img2.jpg" sorts before "img10.jpg").
sub _compare_alphanum {
    my ($left, $right) = @_;
    my ($left_txt, $left_num) = $left =~ /^(\D*)(\d*)/;
    my ($right_txt, $right_num) = $right =~ /^(\D*)(\d*)/;

    if ($left_txt eq $right_txt) {
        return ($left_num || 0) <=> ($right_num || 0);
    }
    return $left_txt cmp $right_txt;
}

# Writes an item file listing the page images found next to $input_filename.
#
# Parameters:
#   $input_filename - path whose directory is scanned and whose base name
#                     (without extension) names the item file
#
# For every imgN.jpg in that directory this calls tidy_up_html on the
# matching textN.html and writes one "N:imgN.jpg:textN.html:" line.
# Returns the item file's path, or $input_filename unchanged when the
# directory cannot be read or the item file cannot be opened.
sub generate_item_file {
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the files in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }

    my @dir = readdir ($dir_handle);
    closedir ($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open ($item_handle, '>', $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # nothing can be written, so give up as for an unreadable directory
        return $input_filename;
    }
    print $item_handle "<GeneratedBy>$plugin_name\n";

    # Sort with an inline comparator; this file defines no alphanum_sort
    # sub for "sort alphanum_sort" to find.
    my @sorted_dir = sort { _compare_alphanum($a, $b) } @dir;
    foreach my $file (@sorted_dir) {
        if ($file =~ /^img(\d+)\.jpg$/) {
            my $num = $1;
            $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
            print $item_handle "$num:img$num.jpg:text$num.html:\n";
        }
    }
    close $item_handle;
    return $itemfile_name;
}
Note: See TracBrowser for help on using the browser.