source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak@ 31757

Last change on this file since 31757 was 31757, checked in by ak19, 7 years ago

Fixed the earlier problems, which, it turned out, had to do with the order in which the superclass plugin instances were merged to create the subclass plugin. I can run the PDFBox command via UnknownConverterPlugin now at last, but while the text does go into doc.xml, previewing doesn't give me access to the HTML file. Not sure whether this requires fixing up a Formatting statement, or whether I'm not doing enough in the plugin.

File size: 19.6 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
48
# Register the two parent classes at compile time. The order is significant:
# UnknownPlugin is listed (and therefore searched) before ConvertBinaryFile.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
52
# Output formats selectable through the convert_to option. Every description
# key follows the pattern {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(text html pagedimg_jpg pagedimg_gif pagedimg_png)
];

# Options this plugin adds on top of those contributed by its parent classes:
#   exec_cmd                - the external conversion command line (required)
#   convert_to              - target format, one of $convert_to_list (required)
#   output_file_or_dir_name - optional name of the produced file or directory
my $arguments = [
    {
        'name' => "exec_cmd",
        'desc' => "{UnknownConverterPlugin.exec_cmd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "yes",
    },
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
    {
        'name' => "output_file_or_dir_name",
        'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "",
    },
];

# Plugin descriptor that new() pushes onto the shared OptList.
my $options = {
    'name'     => "UnknownConverterPlugin",
    'desc'     => "{UnknownConverterPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
89
90
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments and option descriptor
#                      are appended to its "ArgList" and "OptList" entries
#
# Builds one instance of each parent class, merges them into a single object,
# applies the convert_to settings and loads the secondary plugin that will
# process the converted output. Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method-call syntax instead of the ambiguous indirect-object form.
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    $self = bless $self, $class;

    # Temporary diagnostic; convert_to may legitimately be undefined at this point.
    my $convert_to_for_display = defined $self->{'convert_to'} ? $self->{'convert_to'} : "";
    print STDERR "\n\n**** convert_to is |" . $convert_to_for_display . "|\n\n";
    if (!defined $self->{'convert_to'}) {
        # NOTE(review): it is unclear why the 'deft' value declared for the
        # convert_to argument is not already applied here - to be investigated.
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # bless again, copied from PDFPlugin, PowerPointPlugin
    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
147
148# Are init, begin and deinit necessary (will they not get called automatically)?
149# Copied here from PDFPlugin, PowerPointPlugin
150# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
151# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
152# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
# Forward init() explicitly to the ConvertBinaryFile implementation rather
# than relying on @ISA order. Remaining arguments are passed through untouched
# and the parent's return value is handed back to the caller.
sub init {
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
159
# Forward begin() explicitly to the ConvertBinaryFile implementation,
# passing the remaining arguments through and returning its result.
sub begin {
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
166
# Forward deinit() explicitly to the ConvertBinaryFile implementation,
# passing the remaining arguments through and returning its result.
sub deinit {
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
173
174# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
# Overrides ConvertBinaryFile::tmp_area_convert_file() so that the user-supplied
# exec_cmd is run in place of gsConvert.pl.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not used here
#   $input_filename - path of the source document to convert
#   $textref        - accepted for interface compatibility; not used here
#
# The command in $self->{'exec_cmd'} may contain two placeholders:
#   INPUT_FILE - replaced by the double-quoted input file path
#   OUTPUT     - replaced by the double-quoted output directory (paged-image
#                output) or output file path (all other output types)
#
# Returns the path of the generated file (the .item file for paged-image
# output) on success, or "" after printing a message to the plugin's outhandle
# on any failure. Also sets $self->{'tmp_dir'}, {'converted_to'},
# {'output_dirname'} and {'output_filename'}.
sub tmp_area_convert_file {
    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    #### ADAPTED FROM ConvertBinaryFile::tmp_area_convert_file()
    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'}; # set by ConvertBinaryFile::set_standard_convert_settings()

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename; $suffix keeps its leading dot
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir, falling back to the source directory
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    if ($self->{'verbosity'} > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    }

    my $output_type = $convert_to;
    my $is_paged_output = ($output_type =~ /item/i || $output_type =~ /^pagedimg/) ? 1 : 0;

    # record the kind of output the secondary plugin will have to process
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($is_paged_output) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($is_paged_output) {
        if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_dirname = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname;
        } else {
            $output_dirname = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname;
        }
        $output_filename .= ".item";
    } else {
        # \Q..\E: the suffix is literal text, its "." must not act as a regex wildcard
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }
    #### END ADAPTED FROM ConvertBinaryFile::tmp_area_convert_file()

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # undefined or empty string, for instance
        $cmd = "" unless defined $cmd;
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # Replace both placeholders in a single pass, so that an input path which
    # happens to contain the text "OUTPUT" is not itself rewritten.
    my %placeholder_value = (
        'INPUT_FILE' => "\"$input_filename\"",
        'OUTPUT'     => '"' . (defined $output_dirname ? $output_dirname : $output_filename) . '"',
    );
    $cmd =~ s/(INPUT_FILE|OUTPUT)/$placeholder_value{$1}/g;

    print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n";
    print STDERR "   on infile |$input_filename|\n";
    print STDERR "   to produce expected $output_filename\n";
    my $status = system($cmd);
    my $launch_error = "$!";

    # system() returns the wait status: -1 if the command could not be started
    # at all, otherwise the exit code is held in the high byte.
    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: could not launch cmd ($launch_error) when running $cmd\n";
        return "";
    }

    my $exit_code = $status >> 8;
    if ($exit_code == 127 || $exit_code == 9009) { # cmd not recognised on Unix and Windows, respectively
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_code when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_code (wait status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    # Verify the converter produced what was asked for: a directory for
    # paged-image output (the .item file is only generated further below),
    # a single file otherwise.
    if (defined $output_dirname) {
        if (!-d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (!-e $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($self->{'convert_to'} =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
335
336# Copied from PowerPointPlugin, with some modifications
337# override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
# Plugin entry point for reading one file.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Unless convert_to is "html_multi", this simply defers to BaseImporter::read().
# For "html_multi", the file is converted with tmp_area_convert_file() and every
# .html file found next to the converted file is read through the HTMLPlugin
# secondary plugin and handed to $processor as its own document.
#
# Returns undef if this plugin cannot process the file, -1 if the conversion
# failed, the secondary plugin's return value if it did not process one of the
# html files, and 1 once all html files have been processed.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special if we have a directory of html files
    if ($self->{'convert_to'} ne "html_multi") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }

    my $plugin_name = $self->{'plugin_type'};
    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$plugin_name'>\n" if ($gli);
    print $outhandle "$plugin_name processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    # $suffix keeps its leading dot (e.g. ".html")
    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    if (opendir(my $dir_handle, $html_dirname)) {
        @dir = readdir($dir_handle);
        closedir($dir_handle);
    } else {
        print $outhandle "$plugin_name: Couldn't read directory $html_dirname\n";
        # just process the converted file itself; the suffix already carries its dot
        @dir = ("$tailname$suffix");
    }

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$plugin_name");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

        # root of the ORIGINAL (unconverted) file name
        my ($source_tailname, $source_dirname, $source_suffix)
            = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # the conversion output is created in the tmp area, so it can be cleaned up
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
446
447# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
# Explicitly pick ConvertBinaryFile's read_into_doc_obj (the one that drives
# the secondary plugin), forwarding all arguments and returning its result.
sub read_into_doc_obj {
    my $self = shift;
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
452
# Explicitly pick UnknownPlugin's process implementation, forwarding all
# arguments and returning its result.
sub process {
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
457
458# do we also need a html_multi option to convert_to?
459# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
# Writes "<tailname>.item" into the directory of $input_filename, listing every
# page image named img<N>.jpg found in that directory, in ascending numeric
# order of N. Each line has the form "N:imgN.jpg:textN.html:", and
# tidy_up_html() is called on the matching textN.html path first.
#
# Parameter:
#   $input_filename - path whose directory is scanned and whose base name
#                     (without extension) names the item file
#
# Returns $input_filename if the directory cannot be read, otherwise the path
# of the item file. If the item file cannot be opened for writing, a message
# is printed and the path is still returned without anything being written.
#
# NOTE(review): tidy_up_html() is not defined in this file; presumably it is
# expected to be inherited from a parent class - confirm.
sub generate_item_file {
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the files in the directory
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir($dir_handle);
    closedir($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open($item_handle, '>', $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # nothing can be written; do not print to an unopened handle
        return $itemfile_name;
    }
    print $item_handle "<GeneratedBy>$plugin_name\n";

    # Keep only the page images and order them by page number. The number is
    # kept exactly as it appears in the file name (leading zeros included).
    my @page_numbers = sort { $a <=> $b }
                       map  { /^img(\d+)\.jpg$/ ? $1 : () } @dir;

    foreach my $num (@page_numbers) {
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $item_handle "$num:img$num.jpg:text$num.html:\n";
    }
    close($item_handle);
    return $itemfile_name;
}
Note: See TracBrowser for help on using the repository browser.