root/main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak @ 31759

Revision 31759, 19.8 KB (checked in by ak19, 2 years ago)

The previous commit put text into doc.xml, but no text was visible when previewing the extracted HTML version of the doc. Kathy pointed out that NoText was set. 'This doc has no text' was also appended to the doc.xml. Overrode add_dummy_text so that neither the NoText flag nor the dummy text gets set.

Line 
1###########################################################################
2#
3# UnknownConverterPlugin.pm -- plugin that runs the provided cmdline cmd
4# to launch a custom, unknown external conversion application that will
5# convert from some custom unknown format to one of txt, html or xml.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package UnknownConverterPlugin;
30
31use strict;
32no strict 'subs';
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ConvertBinaryFile;
36use UnknownPlugin;
37
38# TO DO:
39# - error messages and other display strings need to go into strings.properties
40# - Have a TEMPDIR placeholder in the command, which, if present, gets replaced with the usual tempdir location
41# of a collection, and in which case we have to clean up intermediate files generated in there at the end?
42# Add a check that the generated file or files generated in the output dir match the convert_to option selected
43# before trying to process them
44# Add option that says where output comes from: stdout of the process, file that gets generated, folder.
45# At present, a file or folder of files is assumed.
46# Need to look in there for files with extension process_ext.
47# Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
48
# Set up the inheritance chain at compile time. UnknownPlugin is listed
# first, ConvertBinaryFile second; new() relies on this same order when it
# merges the two superclass objects.
BEGIN {
    @UnknownConverterPlugin::ISA = qw(UnknownPlugin ConvertBinaryFile);
}
52
# Values accepted by the convert_to option. Each entry pairs the option
# value with the display-string key "{ConvertBinaryFile.convert_to.<value>}".
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(text html pagedimg_jpg pagedimg_gif pagedimg_png)
];
65
# Plugin-specific arguments, appended to the shared "ArgList" by new().
#   exec_cmd                - the conversion command line (required)
#   convert_to              - one of the $convert_to_list values (required)
#   output_file_or_dir_name - optional string, empty by default
my $arguments = [
    {
        'name' => 'exec_cmd',
        'desc' => '{UnknownConverterPlugin.exec_cmd}',
        'type' => 'string',
        'reqd' => 'yes',
        'deft' => '',
    },
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'list' => $convert_to_list,
        'reqd' => 'yes',
        'deft' => 'text',
    },
    {
        'name' => 'output_file_or_dir_name',
        'desc' => '{UnknownConverterPlugin.output_file_or_dir_name}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '',
    },
];
83
# Plugin description record, appended to the shared "OptList" by new().
my $options = {
    'name'     => 'UnknownConverterPlugin',
    'desc'     => '{UnknownConverterPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
89
90
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed unchanged to both superclass constructors
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays are
#                      extended with this plugin's $arguments and $options
#
# Builds one object from each superclass, merges them, blesses the result
# into $class, configures the secondary plugin chosen by convert_to, and
# returns the object after load_secondary_plugins() has been called on it.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method-call syntax instead of the ambiguous indirect-object
    # form ("new UnknownPlugin(...)").
    my $unknown_converter_self = UnknownPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Need to feed the superclass plugins to merge_inheritance() below in the order that the
    # superclass plugins were declared in the ISA listing earlier in this file:
    my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self);

    # One bless is enough: nothing below replaces $self.
    $self = bless $self, $class;

    # Debug trace of the value as supplied. An undefined value is shown as
    # empty rather than being concatenated directly.
    my $supplied_convert_to = defined $self->{'convert_to'} ? $self->{'convert_to'} : "";
    print STDERR "\n\n**** convert_to is |" . $supplied_convert_to . "|\n\n";

    # NOTE(review): convert_to has been seen undefined here even though a
    # default of "text" is declared in $arguments; the reason is not known.
    if (!defined $self->{'convert_to'}) {
        $self->{'convert_to'} = "text";
    }

    # Convert_To set up, including secondary_plugins for processing the text or html generated
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # using defaults for secondary plugins, taken from RTFPlugin
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
147
148# Called by UnknownPlugin::process()
149# Overriding here to ensure that the NoText flag (metadata) and dummy text are not set,
150# since, unlike UnknownPlugin, this plugin has a chance of extracting text from the unknown file format
sub add_dummy_text {
    # Deliberate no-op: nothing is added to any document. The object
    # itself is handed back, as the one-line original implicitly did.
    my ($self) = @_;
    return $self;
}
154
155# Are init, begin and deinit necessary (will they not get called automatically)?
156# Copied here from PDFPlugin, PowerPointPlugin
157# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
158# "$class->SUPER::new always calls A::new because A comes before B in @ISA. See method resolution order in perlobj: ..."
159# https://stackoverflow.com/questions/15414696/when-using-multiple-inheritance-in-perl-is-there-a-way-to-indicate-which-super-f
# Forwards all arguments to ConvertBinaryFile's init explicitly (by
# qualified method name) and returns whatever it returns.
sub init {
    my ($self, @init_args) = @_;

    return $self->ConvertBinaryFile::init(@init_args);
}
166
# Forwards all arguments to ConvertBinaryFile's begin explicitly (by
# qualified method name) and returns whatever it returns.
sub begin {
    my ($self, @begin_args) = @_;

    return $self->ConvertBinaryFile::begin(@begin_args);
}
173
# Forwards all arguments to ConvertBinaryFile's deinit explicitly (by
# qualified method name) and returns whatever it returns.
sub deinit {
    my ($self, @deinit_args) = @_;

    return $self->ConvertBinaryFile::deinit(@deinit_args);
}
180
181# overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
# Runs the user-supplied conversion command (the exec_cmd option) on one
# input file, in place of the conversion ConvertBinaryFile would perform.
#
# Arguments (after $self):
#   $output_ext     - accepted for interface compatibility; not used here
#   $input_filename - path of the file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# In exec_cmd, every INPUT_FILE is replaced by the double-quoted input path
# and every OUTPUT by the double-quoted expected output file (or output
# directory, for item/pagedimg conversions).
#
# Side effects: sets $self->{'tmp_dir'} (when a timestamped tmp folder was
# obtained), $self->{'converted_to'}, and on success $self->{'output_dirname'}
# and $self->{'output_filename'}.
#
# Returns the output filename on success (the generated item file for
# pagedimg conversions), or "" on any failure after reporting it on the
# plugin's outhandle.
sub tmp_area_convert_file {
    # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?

    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file()
    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings()

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir; fall back to the input file's own directory
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    }

    my $output_type = $self->{'convert_to'};

    # record the kind of output this conversion is expected to produce
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    my $output_dirname;
    if ($output_type =~ /item/i || $output_type =~ /^pagedimg/) {
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_dirname = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname;
        } else {
            $output_dirname = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname;
        }
        $output_filename .= ".item";
    } else {
        # \Q..\E: the suffix is literal text (it starts with "."), not a pattern
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }
    #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file()

    my $cmd = $self->{'exec_cmd'};
    if (!$cmd) { # empty string for instance
        print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n";
        return "";
    }

    # replace occurrences of placeholders in cmd string
    # NOTE(review): the paths are only wrapped in double quotes; a filename
    # containing a double quote or shell metacharacters would alter the
    # command line that the shell sees.
    $cmd =~ s@INPUT_FILE@\"$input_filename\"@g;
    if (defined $output_dirname) {
        $cmd =~ s@OUTPUT@\"$output_dirname\"@g;
    } else {
        $cmd =~ s@OUTPUT@\"$output_filename\"@g;
    }

    print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n";
    print STDERR "   on infile |$input_filename|\n";
    print STDERR "   to produce expected $output_filename\n";
    my $status = system($cmd);
    my $launch_error = "$!"; # capture before anything else can overwrite $!

    # The command has finished with its input: remove the symbolic link to
    # the original file now, so that failure paths do not leave it behind.
    &FileUtils::removeFiles($tmp_filename);

    # system() returns -1 when the command could not be started at all
    if ($status == -1) {
        print $outhandle "$plugin_name Conversion error: cmd could not be run, may not be installed ($launch_error when running $cmd)\n";
        return "";
    }

    # system() returns a wait status; the command's exit value is in the
    # bits above the low 8, so it has to be shifted down before comparing.
    my $exit_value = $status >> 8;

    # 127 / 9009: cmd isn't recognised on Unix and Windows, respectively
    if ($exit_value == 127 || $exit_value == 9009) {
        print $outhandle "$plugin_name Conversion error: cmd unrecognised, may not be installed (got $exit_value when running $cmd)\n";
        return "";
    }

    if ($status != 0) {
        print $outhandle "$plugin_name Conversion error: conversion failed with exit value $exit_value (wait status $status)\n";
        return "";
    }

    # When a directory of output is expected, it is the directory that must
    # exist; the .item file is only generated further below. Otherwise the
    # single output file must exist.
    if (defined $output_dirname) {
        if (!-d $output_dirname) {
            print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n";
            return "";
        }
    }
    elsif (!-e $output_filename) {
        print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n";
        return "";
    }

    # else, conversion success

    # if multiple images were generated by running the conversion
    # NOTE(review): generate_item_file() scans the directory containing
    # $output_filename, which is not $output_dirname -- confirm where the
    # converter is expected to write its images.
    if ($self->{'convert_to'} =~ /^pagedimg/) {
        my $item_filename = $self->generate_item_file($output_filename);

        if (!-e $item_filename) {
            print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n";
            return "";
        }
        $output_filename = $item_filename;
    }

    $self->{'output_dirname'} = $output_dirname;
    $self->{'output_filename'} = $output_filename;

    return $output_filename;
}
342
343# Copied from PowerPointPlugin, with some modifications
344# override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
# Copied from PowerPointPlugin, with some modifications.
#
# Overrides the default read only when convert_to is "html_multi": the
# input is converted, and every *.html file found alongside the converted
# file is read with the secondary HTMLPlugin and handed to $processor.
# For every other convert_to value this defers to BaseImporter::read.
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   undef  when can_process_this_file() rejects the file
#   the result of BaseImporter::read() when convert_to is not "html_multi"
#   -1     when conversion reported an error or produced no file
#   the secondary plugin's return value when it did not process a file
#   1      when the file was processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special if we have a directory of html files
    if ($self->{'convert_to'} ne "html_multi") {
        return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    }

    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($conv_tailname, $html_dirname, $conv_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @html_dir_entries;
    if (opendir (my $html_dh, $html_dirname)) {
        @html_dir_entries = readdir ($html_dh);
        closedir ($html_dh);
    } else {
        # report under this plugin's own name, not the plugin this code was copied from
        print $outhandle "$self->{'plugin_type'}: Couldn't read directory $html_dirname\n";
        # just process the converted file itself; fileparse() returns the
        # suffix with its leading dot, so no extra "." is inserted
        @html_dir_entries = ("$conv_tailname$conv_suffix");
    }

    # the root of the source filename is the same for every generated document
    my ($source_tailname, $source_dirname, $source_suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    foreach my $html_file (@html_dir_entries) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo, "", &util::filename_cat($html_dirname, $html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        # NOTE(review): the encoding is deduced from the generated html
        # file's name, as in the code this was copied from, although the
        # Source metadata set below is for the original file -- confirm.
        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file, $metadata, $plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj, $topsection, $filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # the conversion output is created in the tmp area, so clean it up
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return 1;
}
453
454# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
# Explicitly selects ConvertBinaryFile's read_into_doc_obj (by qualified
# method name), passing all arguments through and returning its result.
sub read_into_doc_obj {
    my ($self, @read_args) = @_;

    return $self->ConvertBinaryFile::read_into_doc_obj(@read_args);
}
459
# Explicitly selects UnknownPlugin's process (by qualified method name),
# passing all arguments through and returning its result.
sub process {
    my ($self, @process_args) = @_;

    return $self->UnknownPlugin::process(@process_args);
}
464
465# do we also need a html_multi option to convert_to?
466# move the following, copied from PPT Plugin, into parent ConvertBinaryPlugin, as it's now shared
# Writes "<tailname>.item" into the directory containing $input_filename.
# The file starts with a <GeneratedBy> line and then has one line per
# img<N>.jpg found in that directory, pairing it with text<N>.html.
#
# Arguments (after $self):
#   $input_filename - path whose directory is scanned and whose name,
#                     minus its last extension, names the item file
#
# Returns $input_filename if the directory cannot be read, otherwise the
# item file's path. If the item file cannot be opened the path is still
# returned, without writing, so the caller's existence check can report it.
sub generate_item_file {
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter

    # find all the files in the directory
    my $scan_dh;
    if (!opendir ($scan_dh, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($scan_dh);
    closedir ($scan_dh);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile_fh;
    if (!open ($itemfile_fh, ">", $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # stop here rather than printing to a handle that never opened
        return $itemfile_name;
    }
    print $itemfile_fh "<GeneratedBy>$plugin_name\n";

    # NOTE(review): alphanum_sort and tidy_up_html are not defined in the
    # part of the file visible here -- confirm they are available at runtime.
    my @sorted_dir = sort alphanum_sort @dir;
    foreach my $entry (@sorted_dir) {
        # NOTE(review): only .jpg pages are listed, although convert_to also
        # offers pagedimg_gif and pagedimg_png -- confirm this is intended.
        next unless $entry =~ /^img(\d+)\.jpg$/;
        my $num = $1;
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $itemfile_fh "$num:img$num.jpg:text$num.html:\n";
    }

    # buffered write errors only surface when the handle is closed
    if (!close ($itemfile_fh)) {
        print $outhandle "$plugin_name: Couldn't finish writing $itemfile_name: $!\n";
    }
    return $itemfile_name;
}
Note: See TracBrowser for help on using the browser.