source: main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak@ 31759

Last change on this file since 31759 was 31759, checked in by ak19, 3 years ago

The previous commit put text into doc.xml, but no text was visible when previewing the extracted HTML version of the doc. Kathy pointed out that NoText was set. 'This doc has no text' was also appended to the doc.xml. Overrode add_dummy_text so that

File size: 19.8 KB
Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
48
# Establish the inheritance chain at compile time. UnknownPlugin is listed
# first, so it takes precedence over ConvertBinaryFile in method lookup.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
52
# Values accepted by the convert_to option. Every description is a
# placeholder key of the form {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(text html pagedimg_jpg pagedimg_gif pagedimg_png)
];

# Options this plugin adds on top of those of its parent plugins.
my $arguments = [
    # the command line to run for the conversion
    {
        'name' => "exec_cmd",
        'desc' => "{UnknownConverterPlugin.exec_cmd}",
        'type' => "string",
        'reqd' => "yes",
        'deft' => "",
    },
    # the format the command is expected to produce
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'list' => $convert_to_list,
        'reqd' => "yes",
        'deft' => "text",
    },
    {
        'name' => "output_file_or_dir_name",
        'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "",
    },
];

# Plugin description record that new() pushes onto the caller's OptList.
my $options = {
    'name'     => "UnknownConverterPlugin",
    'desc'     => "{UnknownConverterPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
89
90
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the merged plugin object blessed into $class, with its secondary
# plugins loaded for the selected convert_to format.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    $self = bless $self, $class;

    # convert_to can still be undefined at this point even though the argument
    # list declares a default, so fall back to "text" explicitly before it is used.
    if (!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self is already blessed into $class above, so no second bless is needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
147
148# Called by UnknownPlugin::process()
149# Overriding here to ensure that the NoText flag (metadata) and dummy text are not set,
150# since, unlike UnknownPlugin, this plugin has a chance of extracting text from the unknown file format
sub add_dummy_text {
    # Deliberately adds nothing to the document: this override exists so that
    # no NoText flag or placeholder text is set for converted documents.
    my $self = shift;

    # Hand back the invocant, which is what the implicit return value of this
    # sub has always been.
    return $self;
}
154
155# Are init, begin and deinit necessary (will they not get called automatically)?
156# Copied here from PDFPlugin, PowerPointPlugin
157# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
158# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
159# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
sub init {
    # With two parent classes, name the ConvertBinaryFile implementation
    # explicitly and pass the caller's arguments straight through to it.
    my $self = shift;
    return $self->ConvertBinaryFile::init(@_);
}
166
# Start-of-import hook: hands the call, arguments untouched, to the
# ConvertBinaryFile implementation and returns whatever it returns.
sub begin {
    my $self = shift;
    return $self->ConvertBinaryFile::begin(@_);
}
173
# Counterpart of init: hands the call, arguments untouched, to the
# ConvertBinaryFile implementation and returns whatever it returns.
sub deinit {
    my $self = shift;
    return $self->ConvertBinaryFile::deinit(@_);
}
180
181# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
sub tmp_area_convert_file {
    # Runs the user-supplied conversion command (option exec_cmd) on one input
    # file, in place of the gsConvert.pl call made by ConvertBinaryFile.
    #
    # Parameters:
    #   $output_ext     - accepted for interface compatibility; not used here
    #   $input_filename - path of the file to convert
    #   $textref        - accepted for interface compatibility; not used here
    #
    # The placeholders INPUT_FILE and OUTPUT in the command are replaced with
    # the double-quoted input path and the expected output file (or, for
    # paged-image output, the expected output directory).
    #
    # Returns the name of the generated file (the .item file for paged-image
    # output), or "" if the command is missing, fails, or produces no output.
    # On success also records 'output_dirname' and 'output_filename' in $self.

    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file()
    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings()

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    if ($self->{'verbosity'} > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    my $is_paged_output = ($convert_to =~ /item/i || $convert_to =~ /^pagedimg/) ? 1 : 0;
    if ($convert_to =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($convert_to =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($is_paged_output) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($is_paged_output) {
        # GSDLOS may be unset, so test for definedness before matching it
        my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
        my $separator = $on_windows ? "\\" : "/";
        $output_dirname = $tmp_dirname . $separator . $utf8_tailname . $separator . $utf8_tailname;
        $output_filename .= ".item";
    } else {
        # \Q..\E: the suffix is literal text, not a pattern (its leading "." must not match any character)
        $output_filename =~ s/\Q$lc_suffix\E$/.$convert_to/;
    }

    #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file()

    my $cmd = $self->{'exec_cmd'};
    if(!$cmd) { # empty string for instance
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # replace occurrences of placeholders in cmd string
    # NOTE(review): the paths are only wrapped in double quotes; a filename that itself
    # contains a double quote or other shell metacharacters is not escaped.
    $cmd =~ s/INPUT_FILE/"$input_filename"/g;
    my $output_target = (defined $output_dirname) ? $output_dirname : $output_filename;
    $cmd =~ s/OUTPUT/"$output_target"/g;

    print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n";
    print STDERR "   on infile |$input_filename|\n";
    print STDERR "   to produce expected $output_filename\n";
    my $status = system($cmd);

    # system() returns the raw wait status: -1 if the command could not be started,
    # otherwise the exit code is held in the high byte and the terminating signal
    # (if any) in the low 7 bits.
    my $exit_value = ($status == -1) ? -1 : ($status >> 8);

    # Exit code 127 (Unix shell) or 9009 (Windows) means the cmd isn't recognised.
    # The raw status is also compared with 9009 in case it is handed back unshifted.
    if ($status == -1 || $exit_value == 127 || $exit_value == 9009 || $status == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        my $signal = $status & 127;
        my $reason = $signal ? "was terminated by signal $signal" : "failed with exit value $exit_value";
        print $outhandle "$plugin_name Conversion error: conversion $reason\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    # Check that the command produced what was expected. For paged-image output the
    # directory is what the command creates; the .item file is only written below.
    if (defined $output_dirname) {
        if (!-d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (!-e $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($convert_to =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
342
343# Copied from PowerPointPlugin, with some modifications
344# override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
sub read {
    # Reads one file from the import area.
    #
    # Unless convert_to is "html_multi", this simply defers to
    # BaseImporter::read. For "html_multi" the file is converted with
    # tmp_area_convert_file(), and every *.html file found in the directory of
    # the converted file is read by the HTMLPlugin secondary plugin and handed
    # to $processor as a separate document.
    #
    # Returns undef if this plugin cannot process the file, -1 if conversion
    # failed, the secondary plugin's status if it did not process a file,
    # and 1 once all generated html files have been processed.
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special if we have a directory of html files
    if ($self->{'convert_to'} ne "html_multi") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }

    my $outhandle = $self->{'outhandle'};
    my $plugin_name = $self->{'plugin_type'};
    print STDERR "<Processing n='$file' p='$plugin_name'>\n" if ($gli);
    print $outhandle "$plugin_name processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    # The root name of the source file is the same for every generated html
    # file, so work it out once rather than on each loop iteration.
    my ($source_root)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    my @html_candidates;
    my $dir_handle;
    if (!opendir ($dir_handle, $html_dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $html_dirname\n";
        # just process the converted file itself ($suffix already carries its leading dot)
        @html_candidates = ("$tailname$suffix");
    } else {
        @html_candidates = readdir ($dir_handle);
        closedir ($dir_handle);
    }

    foreach my $html_file (@html_candidates) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        my $topsection = $doc_obj->get_top_section();
        $doc_obj->set_utf8_metadata_element($topsection, "Plugin", "$plugin_name");
        $doc_obj->set_utf8_metadata_element($topsection, "FileSize", (-s $filename_full_path));
        $doc_obj->set_utf8_metadata_element($topsection, "FilenameRoot", $source_root);

        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # the conversion output is created in the tmp area, so clean it up now
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
453
454# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
sub read_into_doc_obj {
    # Select the ConvertBinaryFile implementation by name, forwarding every
    # argument and returning its result unchanged.
    my $self = shift;
    return $self->ConvertBinaryFile::read_into_doc_obj(@_);
}
459
# Document processing is taken from UnknownPlugin: the call is forwarded to
# its implementation with all arguments, and its result is returned.
sub process {
    my $self = shift;
    return $self->UnknownPlugin::process(@_);
}
464
465# do we also need a html_multi option to convert_to?
466# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
sub generate_item_file {
    # Writes "<root>.item" into the directory of $input_filename, listing every
    # image there named img<N>.jpg in ascending order of N. Each entry has the
    # form "N:imgN.jpg:textN.html:".
    #
    # Parameter:
    #   $input_filename - path whose directory is scanned and whose root name
    #                     (suffix removed) names the item file
    #
    # Returns the item file's path, or $input_filename unchanged if the
    # directory cannot be read or the item file cannot be opened for writing.
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the page images in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @page_numbers = map { /^img(\d+)\.jpg$/ ? $1 : () } readdir ($dir_handle);
    closedir ($dir_handle);

    # Order pages by number. This is done inline because this package defines
    # no alphanum_sort sub, and sort does not look up a named sub via @ISA.
    # The string comparison only breaks ties such as "01" versus "1".
    @page_numbers = sort { $a <=> $b || $a cmp $b } @page_numbers;

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open ($item_handle, '>', $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # nothing can be written, so stop here rather than print to an unopened handle
        return $input_filename;
    }
    print $item_handle "<GeneratedBy>$plugin_name\n";

    foreach my $num (@page_numbers) {
        # NOTE(review): tidy_up_html is not defined in this file; it is only
        # called when some class in the inheritance chain provides it.
        if ($self->can('tidy_up_html')) {
            $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        }
        print $item_handle "$num:img$num.jpg:text$num.html:\n";
    }
    close ($item_handle);
    return $itemfile_name;
}
Note: See TracBrowser for help on using the repository browser.