source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak@ 31743

Last change on this file since 31743 was 31743, checked in by ak19, 7 years ago

Committing first attempt at new UnknownConverterPlugin, which hasn't yet been tested. Hence the bak suffix to the file being committed, to prevent the very much untested new plugin from interfering with Greenstone. Next step is to try this plugin out on IceCite to convert PDFs or the djvu conversion tool a member brought up on the mailing list.

File size: 13.5 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom, unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35
36# TO DO:
37# - error messages and other display strings need to go into strings.properties
38# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
39# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
40
# Register the two parent classes at compile time. UnknownPlugin is listed
# first, so it is consulted first during method resolution.
# NOTE(review): neither parent module is loaded by a 'use' in the visible part
# of this file -- presumably the plugin loader has already loaded them; confirm.
sub BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
44
# Values accepted by the -convert_to option. Each 'desc' is a "{Key}"
# placeholder; every key follows the pattern
# ConvertBinaryFile.convert_to.<name>. The pagedimg_* entries previously began
# with a stray double brace ("{{"), which made their keys differ from the
# text/html entries.
my $convert_to_list =
    [ { 'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}" },
      { 'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}" },
      { 'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}" },
      { 'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}" },
      { 'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}" }
    ];
57
# Plugin-specific options, appended to the shared "ArgList" in new().
#   exec_cmd                - the conversion command line to run
#   convert_to              - one of the names in $convert_to_list
#   output_file_or_dir_name - where the conversion command leaves its output
my $arguments = [
    {
        'name' => "exec_cmd",
        'type' => "string",
        'reqd' => "yes",
        'deft' => "",
        'desc' => "{UnknownConverterPlugin.exec_cmd}",
    },
    {
        'name' => "convert_to",
        'type' => "enum",
        'list' => $convert_to_list,
        'reqd' => "yes",
        'deft' => "text",
        'desc' => "{ConvertBinaryFile.convert_to}",
    },
    {
        'name' => "output_file_or_dir_name",
        'type' => "string",
        'reqd' => "yes",
        'deft' => "",
        'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}",
    },
];
75
# Plugin descriptor, appended to the shared "OptList" in new(); it ties the
# plugin name and description key to the argument list declared above.
my $options = {
    'name'     => "UnknownConverterPlugin",
    'desc'     => "{UnknownConverterPlugin.desc}",
    'args'     => $arguments,
    'inherits' => "yes",
    'abstract' => "no",
};
81
82
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is pushed onto it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's $arguments and $options
#
# Builds one object from each parent class, merges the two with
# BaseImporter::merge_inheritance, then configures and loads the secondary
# plugin selected by set_standard_convert_settings().
#
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow-style constructor calls: unlike "new Class(...)", these cannot be
    # mis-parsed as a call to a local sub named new.
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($cbf_self, $unknown_converter_self);

    # Bless before calling any methods on the merged object. (The original
    # blessed a second time further down; nothing in between re-blesses, so
    # once is enough.)
    $self = bless $self, $class;

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }

    # Fixed: the closing parenthesis and semicolon were transposed here
    # ("...$hashArgOptLists;)"), which is a syntax error.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
126
127# Are init, begin and deinit necessary (will they not get called automatically)?
128# Copied here from PDFPlugin, PowerPointPlugin
129# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
130# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
131# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
# Delegates explicitly to ConvertBinaryFile::init, bypassing normal method
# resolution (UnknownPlugin comes first in @ISA). All arguments after the
# invocant are forwarded, and the parent's return value is passed back.
sub init {
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
138
# Delegates explicitly to ConvertBinaryFile::begin, forwarding every argument
# after the invocant and passing back the parent's return value.
sub begin {
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
145
# Delegates explicitly to ConvertBinaryFile::deinit, forwarding every argument
# after the invocant and passing back the parent's return value.
sub deinit {
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
152
# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not used here
#   $input_filename - path of the file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# Reads from $self: exec_cmd, output_file_or_dir_name, convert_to,
# plugin_type and outhandle.
#
# Returns "" on any failure (after printing a message to outhandle).
# On success returns the configured output file/dir name, or, for the
# pagedimg_* conversions, the name returned by generate_item_file().
sub tmp_area_convert_file {
    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter
    my $outhandle   = $self->{'outhandle'};

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string (or undef) for instance
        my $shown_cmd = defined $cmd ? $cmd : "";
        print $outhandle "$plugin_name Conversion error: invalid cmd $shown_cmd\n";
        return "";
    }

    # Replace every '*' placeholder in the command with the input file's name
    # stripped of its directory and extension (the first value returned by
    # fileparse).
    # NOTE(review): the name is substituted unquoted -- names containing
    # spaces or shell metacharacters need quoting in exec_cmd itself.
    my ($tailname, $dir, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    $cmd =~ s/\*/$tailname/g;   # fixed: was "~=", which is not an operator
    print STDERR "@@@@ $plugin_name: executing conversion cmd $cmd\n";
    my $status = system($cmd);

    # system() returns -1 when the program could not be started at all;
    # otherwise it returns the wait status, whose exit code is in the high
    # byte. The original compared the raw status to 127/9009, which never
    # matches a shell's "command not found" exit code.
    my $exit_value = ($status == -1) ? -1 : ($status >> 8);

    # 127 (Unix shells) and 9009 (Windows) mean the command isn't recognised.
    if ($status == -1 || $exit_value == 127 || $exit_value == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        # If the exit code is 0 but the status is not (e.g. killed by a
        # signal), report the raw status instead.
        my $reported = $exit_value ? $exit_value : $status;
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $reported\n";
        return "";
    }

    my $output_file_or_dir = $self->{'output_file_or_dir_name'};
    if (!-e $output_file_or_dir) {
        print $outhandle "$plugin_name Conversion error: Output file/dir $output_file_or_dir doesn't exist\n";
        return "";
    }

    # else, conversion success

    # If multiple images were generated by the conversion. The declared
    # convert_to values are pagedimg_jpg/_gif/_png, so match on the prefix;
    # the original exact comparison with "pagedimg" could never be true.
    if ($self->{'convert_to'} =~ m/^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_file_or_dir);
        return $item_filename;
    }

    return $output_file_or_dir;
}
207
# Copied from PowerPointPlugin, with some modifications
# Override the default read in one situation: when converting to html and the
# configured output location is a directory, every *.html file found there is
# read through the secondary HTMLPlugin and processed as its own document.
# In all other cases this defers to BaseImporter::read.
#
# Parameters: the standard plugin read() arguments ($pluginfo, $base_dir,
# $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli).
#
# Returns:
#   undef - this plugin cannot process the file
#   -1    - the conversion failed
#   1     - the file was processed
#   or the secondary plugin's return value if it did not process an html file
#
# NOTE(review): the -d test on output_file_or_dir_name happens BEFORE the
# conversion command is run, so the directory must already exist for the
# multi-file path to be taken -- confirm this is intended.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    my $output_file_or_dir = $self->{'output_file_or_dir_name'};
    my $is_output_dir = (-d $output_file_or_dir) ? 1 : 0;

    # we are only doing something special if we have a directory of html files
    if (!$is_output_dir || $self->{'convert_to'} ne "html") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }

    my $outhandle   = $self->{'outhandle'};
    my $plugin_name = $self->{'plugin_type'};
    print STDERR "<Processing n='$file' p='$plugin_name'>\n" if ($gli);
    print $outhandle "$plugin_name processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($conv_tailname, $html_dirname, $conv_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    if (opendir (my $dir_handle, $html_dirname)) {
        @dir = readdir ($dir_handle);
        closedir ($dir_handle);
    } else {
        # Fixed: the message used to name PowerPointPlugin (copy-paste).
        print $outhandle "$plugin_name: Couldn't read directory $html_dirname\n";
        # Just process the converted file itself. fileparse's suffix already
        # contains the leading dot, so do not add another one.
        @dir = ("$conv_tailname$conv_suffix");
    }

    # FilenameRoot is derived from the source file, so compute it once.
    my ($source_root)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin

        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        # As in the original loop, the encoding is deduced from the html
        # file's name (the loop variable), not from the source file's name.
        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        my $topsection = $doc_obj->get_top_section();
        $doc_obj->set_utf8_metadata_element($topsection, "Plugin", "$plugin_name");
        $doc_obj->set_utf8_metadata_element($topsection, "FileSize", (-s $filename_full_path));
        $doc_obj->set_utf8_metadata_element($topsection, "FilenameRoot", $source_root);

        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the one generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # deleted some commented out code here that exists in PowerPointPlugin

    # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created
    # as we don't know where it was created
    #$self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
318
# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
# Fixed: this used to call ConvertBinaryFile::deinit (a copy-paste slip), so
# no document was ever read. All arguments after the invocant are forwarded
# and the parent's return value is passed back in the caller's context.
sub read_into_doc_obj {
    my $self = shift (@_);
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
324
# Delegates explicitly to UnknownPlugin::process, forwarding every argument
# after the invocant and passing back the parent's return value.
sub process {
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
329
# do we also need a html_multi option to convert_to?
# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
#
# Writes "<tailname>.item" next to $input_filename, listing every imgN.jpg
# found in that directory in ascending page-number order, each paired with
# textN.html.
#
# Parameters:
#   $input_filename - path whose directory is scanned and whose base name
#                     (extension removed) names the item file
#
# Returns the item file's path, or $input_filename unchanged if the directory
# cannot be read or the item file cannot be opened for writing.
#
# NOTE(review): only .jpg pages are listed although convert_to also offers
# pagedimg_gif and pagedimg_png -- confirm whether those need handling.
# NOTE(review): tidy_up_html is not defined in the visible part of this file;
# presumably it is inherited -- confirm.
sub generate_item_file {
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the files in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dir_handle);
    closedir ($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open ($item_handle, '>', $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # Fixed: the original carried on and printed to the unopened handle.
        return $input_filename;
    }
    print $item_handle "<GeneratedBy>$plugin_name\n";

    # Collect the page numbers of the imgN.jpg files and order them
    # numerically. The original used "sort alphanum_sort", but no comparator
    # of that name is defined in this package. The captured digits are kept
    # as strings so any zero padding survives in the output.
    my @page_numbers = sort { $a <=> $b }
                       map  { /^img(\d+)\.jpg$/ ? $1 : () } @dir;

    foreach my $num (@page_numbers) {
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $item_handle "$num:img$num.jpg:text$num.html:\n";
    }
    close ($item_handle);
    return $itemfile_name;
}
Note: See TracBrowser for help on using the repository browser.