root/main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak @ 31757

Revision 31757, 19.6 KB (checked in by ak19, 3 years ago)

Fixed the earlier problems, which, it turned out, had to do with the order in which the superclass plugin instances were merged to create the subclass plugin. I can run the PDFBox command via UnknownConverterPlugin now at last, but while the text does go into doc.xml, previewing doesn't give me access to the HTML file. Not sure if this requires fixing up a Formatting statement, or whether I'm not doing enough in the plugin.

Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
48
# Set up the inheritance list at compile time. UnknownPlugin is listed
# before ConvertBinaryFile, which is the order new() relies on when it
# merges the two superclass instances.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
52
# Values accepted by the convert_to option. Each 'desc' is a resource-string
# key, not display text.
my $convert_to_list = [
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'html',         desc => '{ConvertBinaryFile.convert_to.html}' },
    { name => 'pagedimg_jpg', desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];

# Arguments this plugin adds on top of those of its superclasses.
my $arguments = [
    # The command line to run; may contain INPUT_FILE and OUTPUT placeholders.
    {
        name => 'exec_cmd',
        desc => '{UnknownConverterPlugin.exec_cmd}',
        type => 'string',
        deft => '',
        reqd => 'yes',
    },
    # Which of the formats in $convert_to_list the command produces.
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {
        name => 'output_file_or_dir_name',
        desc => '{UnknownConverterPlugin.output_file_or_dir_name}',
        type => 'string',
        reqd => 'no',
        deft => '',
    },
];

# Plugin description record that new() pushes onto the shared "OptList".
my $options = {
    name     => 'UnknownConverterPlugin',
    desc     => '{UnknownConverterPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
89
90
# Constructor.
#
# Creates one instance of each superclass (UnknownPlugin and
# ConvertBinaryFile), merges them into a single object blessed into $class,
# then configures and loads the secondary plugin that will process the
# converted output.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - plugin arguments, handed unchanged to both superclasses
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's $arguments and $options
#
# Returns the new plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Use arrow method calls rather than "new ClassName(...)": inside a
    # package that defines its own new(), the indirect-object form is
    # ambiguous to the parser.
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    # Must be blessed before the method calls below.
    $self = bless $self, $class;

    # Debug trace of the incoming convert_to value (undef is shown as empty).
    my $convert_to_for_trace = defined $self->{'convert_to'} ? $self->{'convert_to'} : "";
    print STDERR "\n\n**** convert_to is |" . $convert_to_for_trace . "|\n\n";

    # NOTE(review): the argument table declares a default of "text" for
    # convert_to, yet the value can arrive here undefined; reason unknown.
    if (!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self is already blessed into $class above, so no second bless is needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
147
148# Are init, begin and deinit necessary (will they not get called automatically)?
149# Copied here from PDFPlugin, PowerPointPlugin
150# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
151# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
152# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
# Forwards init() explicitly to ConvertBinaryFile, passing along every
# argument after the invocant and returning whatever that call returns.
sub init {
    my $plugin = shift;
    return $plugin->ConvertBinaryFile::init(@_);
}
159
# Forwards begin() explicitly to ConvertBinaryFile, passing along every
# argument after the invocant and returning whatever that call returns.
sub begin {
    my $plugin = shift;
    return $plugin->ConvertBinaryFile::begin(@_);
}
166
# Forwards deinit() explicitly to ConvertBinaryFile, passing along every
# argument after the invocant and returning whatever that call returns.
sub deinit {
    my $plugin = shift;
    return $plugin->ConvertBinaryFile::deinit(@_);
}
173
174# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
# Runs the user-supplied conversion command (the exec_cmd option) on one
# input file and reports where the converted output was written.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not used here
#   $input_filename - path of the file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# Returns the name of the generated output file (for pagedimg_* conversions,
# the generated .item file), or "" if the conversion failed.
#
# Side effects: sets $self->{'converted_to'}, and on success
# $self->{'output_dirname'} and $self->{'output_filename'}; sets
# $self->{'tmp_dir'} when a timestamped tmp folder is available.
sub tmp_area_convert_file {
    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file()
    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings()

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename; $suffix includes its leading dot
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        # no tmp folder available: fall back to the input file's own directory
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    }

    my $output_type = $self->{'convert_to'};

    # Record the output type. Unlike ConvertBinaryFile, the requested type is
    # assumed to be the type actually produced.
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        # NOTE(review): this path layout (tmp/<name>/<name>) was copied from
        # ConvertBinaryFile; confirm it is what the custom command should be given.
        if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_dirname = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname;
        } else {
            $output_dirname = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname;
        }
        $output_filename .= ".item";
    } else {
        # \Q...\E: the suffix is literal text (it starts with "."), not a pattern
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file()

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string for instance
        $cmd = "" unless defined $cmd;
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # Replace the placeholders in a single pass, so that text inserted for one
    # placeholder (e.g. an input filename containing "OUTPUT") is never
    # rewritten by the other.
    # NOTE(review): the filenames are only wrapped in double quotes; names
    # containing '"', '$' or backticks will still be interpreted by the shell.
    my $output_target = defined $output_dirname ? $output_dirname : $output_filename;
    my %placeholder_value = (
        'INPUT_FILE' => "\"$input_filename\"",
        'OUTPUT'     => "\"$output_target\"",
    );
    $cmd =~ s/(INPUT_FILE|OUTPUT)/$placeholder_value{$1}/g;

    print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n";
    print STDERR "   on infile |$input_filename|\n";
    print STDERR "   to produce expected $output_filename\n";

    # system() returns the wait status: -1 if the command could not be started
    # at all, otherwise the exit code is in the high byte ($status >> 8).
    my $status = system($cmd);
    my $exit_value = ($status == -1) ? -1 : ($status >> 8);

    # 127 (Unix shell) and 9009 (Windows) mean the command was not found.
    # The raw-status comparisons are kept for platforms that report it unshifted.
    if ($status == -1 || $exit_value == 127 || $exit_value == 9009
        || $status == 127 || $status == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value (wait status $status)\n";
        return "";
    }

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    if (defined $output_dirname) {
        # Paged-image output: only the directory can exist at this point; the
        # .item file is generated further down.
        if (!-d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (!-e $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    if ($self->{'convert_to'} =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
335
336# Copied from PowerPointPlugin, with some modifications
337# override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
# Reads one file. Unless convert_to is "html_multi" this simply defers to
# BaseImporter::read. For "html_multi" the file is converted, and every
# *.html file found in the directory of the converted file is read by the
# HTMLPlugin secondary plugin and passed to $processor as its own document.
#
# Parameters (after the invocant): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   undef - this plugin cannot process the file
#   -1    - the conversion failed
#   $rv   - the secondary plugin's result, if it did not process an html file
#   1     - the file was processed
#   (or whatever BaseImporter::read returns in the non-html_multi case)
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special if we have a directory of html files
    if ($self->{'convert_to'} ne "html_multi") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }

    my $outhandle = $self->{'outhandle'};
    my $plugin_name = $self->{'plugin_type'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    # $conv_suffix includes its leading dot
    my ($conv_tailname, $html_dirname, $conv_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    my $dir_handle;
    if (!opendir ($dir_handle, $html_dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $html_dirname\n";
        # just process the converted file itself (the suffix already carries the dot)
        @dir = ("$conv_tailname$conv_suffix");
    } else {
        @dir = readdir ($dir_handle);
        closedir ($dir_handle);
    }

    # FilenameRoot is derived from the source file, so compute it once
    my ($source_tailname)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin

        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        # NOTE(review): the encoding is deduced from the generated html file's
        # name but applied to the source file's metadata; confirm this is intended.
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the one generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # The conversion output is now created in the tmp area, so clean it up.
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
446
447# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
# Forwards explicitly to ConvertBinaryFile's read_into_doc_obj, passing along
# every argument after the invocant and returning whatever that call returns.
sub read_into_doc_obj {
    my $plugin = shift;
    return $plugin->ConvertBinaryFile::read_into_doc_obj(@_);
}
452
# Forwards explicitly to UnknownPlugin's process, passing along every
# argument after the invocant and returning whatever that call returns.
sub process {
    my $plugin = shift;
    return $plugin->UnknownPlugin::process(@_);
}
457
458# do we also need a html_multi option to convert_to?
459# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
# Writes a "<name>.item" file listing the page images found next to
# $input_filename.
#
# Every directory entry named img<N>.jpg yields one line
# "<N>:img<N>.jpg:text<N>.html:", in ascending page-number order, after
# tidy_up_html() has been called on the matching text<N>.html path.
#
# Parameters:
#   $input_filename - path whose directory is scanned and whose base name
#                     (minus extension) names the item file
#
# Returns the item file's path. If the directory cannot be read,
# $input_filename is returned unchanged. If the item file cannot be opened,
# its path is still returned but nothing is written, so callers should
# check that it exists.
sub generate_item_file {
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the files in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dir_handle);
    closedir ($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile_handle;
    if (!open ($itemfile_handle, '>', $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # nothing can be written; stop here rather than print to an unopened handle
        return $itemfile_name;
    }
    print $itemfile_handle "<GeneratedBy>$plugin_name\n";

    # Collect the page numbers of the img<N>.jpg files and order them
    # numerically (string comparison breaks ties such as "01" vs "1").
    # NOTE(review): only .jpg pages are listed, although convert_to also
    # offers pagedimg_gif and pagedimg_png.
    my @page_numbers =
        sort { $a <=> $b || $a cmp $b }
        map  { /^img(\d+)\.jpg$/ ? ($1) : () } @dir;

    foreach my $num (@page_numbers) {
        # NOTE(review): tidy_up_html is not defined in this file; it is
        # presumably expected from a superclass.
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $itemfile_handle "$num:img$num.jpg:text$num.html:\n";
    }
    close ($itemfile_handle);
    return $itemfile_name;
}
Note: See TracBrowser for help on using the browser.