source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 26893

Last change on this file since 26893 was 26893, checked in by kjdon, 11 years ago

ConvertBinaryFile needs to reset the doc OID after all the processing has been done. This will mean it uses the top level plugin OIDtype settings, rather than the secondary plugin ones.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
# PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of
# these document types to either HTML, text or a series of images. It works by
# dynamically loading an appropriate secondary plugin (HTMLPlugin,
# StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin
# argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46use util;
47
48
# Set up inheritance at compile time: this package derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
52
# Values accepted by the 'convert_to' option. Each 'desc' is a resource key
# (in braces), not literal display text.
my $convert_to_list = [
    { 'name' => 'auto', 'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html', 'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text', 'desc' => '{ConvertBinaryFile.convert_to.text}' },
];

# Options this plugin contributes to the argument list (see new()).
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record pushed onto the option list (see new()).
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
92
93
# Load and instantiate the secondary plugin(s) that will process the
# converted file.
#
# The plugin class names come from $self->{'convert_to_plugin'} (a
# comma-separated list). For each name, "<name>.pm" is looked for first in
# $GSDLCOLLECTDIR/perllib/plugins (only when GSDLCOLLECTDIR is set) and then
# in $GSDLHOME/perllib/plugins; the first one that exists is require'd. The
# class constructor is then called with an empty plugin list and the
# arguments stored under that class name in $input_args.
#
# Parameters:
#   $class           - accepted for interface compatibility; not used here
#   $input_args      - hash ref: plugin class name -> constructor arguments
#   $hashArgOptLists - accepted for interface compatibility; not used here
#
# Side effects: sets $self->{'secondary_plugins'} to a hash ref mapping
# plugin class name -> plugin instance.
# Dies if a plugin file cannot be found or if a constructor dies.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # 'convert_to_plugin' may not have been set yet; treat that the same as
    # an empty list instead of splitting an undefined value.
    my $plugin_spec = $self->{'convert_to_plugin'};
    my @convert_to_list = (defined $plugin_spec) ? split(",", $plugin_spec) : ();
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the "convert_to" plugin package
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # Call the constructor with the extra options worked out for this
        # class. A direct class-method call replaces the former string eval:
        # the class name is no longer interpolated into code, and a failing
        # constructor still dies with its own error.
        my $arglist = $input_args->{$plugin_class};
        $secondary_plugins->{$plugin_class} = $plugin_class->new([], $arglist);
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
137
# Constructor.
#
# Appends this class to $pluginlist, adds this plugin's argument and option
# definitions to $hashArgOptLists, builds the object through the
# AutoExtractMetadata constructor and re-blesses it into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
150
# Derive the secondary plugin name and output file extension from
# $self->{'convert_to'}.
#
# Should be called by subclasses after checking and setting
# $self->{'convert_to'}. The value "auto" is rewritten to "html". For a
# recognised value, $self->{'convert_to_plugin'} and
# $self->{'convert_to_ext'} are set; for any other value they are left
# untouched.
sub set_standard_convert_settings {
    my ($self) = @_;

    # "auto" is just another way of asking for html
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    my $target = $self->{'convert_to'};

    if ($target =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($target eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    }
    elsif ($target eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($target =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # the image type may be given as pagedimg_jpg, pagedimg_gif or
        # pagedimg_png; anything else falls back to jpg
        my ($image_ext) = $target =~ /pagedimg\_(jpg|gif|png)/i;
        $self->{'convert_to_ext'} = (defined $image_ext) ? $image_ext : 'jpg';
    }
}
# Initialise this plugin through the superclass, then initialise every
# loaded secondary plugin with the same verbosity and handles.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugins = $self->{'secondary_plugins'};
    for my $name (keys %$plugins) {
        $plugins->{$name}->init($verbosity, $outhandle, $failhandle);
    }
}
191
# Called only once, after all plugin passes have been done: passes the
# deinit call on to every loaded secondary plugin.
sub deinit {
    my $self = shift;

    my $plugins = $self->{'secondary_plugins'};
    for my $name (keys %$plugins) {
        $plugins->{$name}->deinit();
    }
}
204
# Hook called by read_into_doc_obj with the converted filename once the
# conversion has succeeded. The default implementation does no post
# processing and returns nothing.
sub convert_post_process {
    return;
}
210
211
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion works on a soft link to the input placed in a tmp directory,
# so that we don't accidentally damage the input. If no timestamped tmp
# folder can be obtained, the directory of the input file is used instead.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; the requested
#                     output type is actually taken from $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - not used here
#
# Returns the name of the converted file, or "" if gsConvert.pl reported
# "fail" (in which case the error log, if any, is copied to the outhandle).
#
# Side effects: sets $self->{'tmp_dir'} when a timestamped tmp folder was
# obtained, and sets $self->{'converted_to'} to "HTML", "Text" or
# "PagedImage" according to the type gsConvert.pl reports.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    # NOTE(review): the command line is handed to the shell as one string, so
    # a filename containing a double quote would break the quoting below.
    my $cmd = "\"".&util::get_perl_exec()."\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # gsConvert.pl prints the type it actually produced (or "fail") on stdout
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # three-argument open on a lexical handle: the log path is
            # never interpreted as an open mode, and a failure is reported
            # instead of being silently ignored
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # NOTE(review): this path is built from $tailname although the tmp
        # file was named from $utf8_tailname - confirm which one gsConvert.pl
        # uses for its output folder.
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # \Q..\E: the suffix is literal text (it starts with "."), not a pattern
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
338
339
# Override the inherited read_into_doc_obj - we need to call secondary
# plugin stuff.
#
# Converts the file with tmp_area_convert_file, hands the converted file to
# the secondary plugin's read_into_doc_obj, then fixes up the resulting
# doc_obj so that it refers to the original source file (source filename and
# path, Source metadata, Plugin, FileSize, FilenameRoot), runs the
# plugin-specific process(), and finally forces a new OID.
#
# Returns:
#   -1             if conversion failed or process() returned undef
#   0              if there is no secondary plugin to use
#   $rv            (undef or < 1) as returned by the secondary plugin when it
#                  did not process the file
#   (1, $doc_obj)  on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_output = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirection can leave an output file behind even
            # though fribidi failed; don't leave it lying around
            unlink($fribidi_output) if (-e $fribidi_output);
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    # Only one secondary plugin is supported. Sort the names so that the
    # choice is the same on every run rather than depending on hash order.
    my @plugin_names = sort keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level - an empty hash is
    # passed in place of $metadata
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used
    # in archives dbs for incremental build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    # force a new OID - this will use OIDtype option set for this plugin.
    $self->add_OID($doc_obj, 1);

    return (1, $doc_obj);

}
441
# Plugin-specific processing of a doc_obj: passes the base directory, file
# and doc_obj on to process_type and returns its result. The remaining
# arguments are not used here.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);
    return $result;
}
448
# Do plugin specific processing of doc_obj for the file's document type.
#
# Associates the original file with the top section of the doc_obj and
# records the FileFormat, srcicon, srclink_file and srclinkFile metadata.
# The associated file is called "doc.<lowercased extension>" unless the
# keep_original_filename option is set, in which case the names come from
# the doc_obj itself. Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);

    my $file_type = (defined $self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);

    my $keep_original = ($self->{'keep_original_filename'} == 1);
    my $default_name = "doc.$doc_ext";

    # associate original file with doc object; when keeping the original
    # name this should be the same filename that was used for the Source and
    # SourceFile metadata, as SourceFile is used in the srclink (below)
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : $default_name;
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original
        ? $doc_obj->get_sourcefile()
        : $default_name;

    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata($cursection, "srclink_file", $srclink_filename);
    $doc_obj->add_utf8_metadata($cursection, "srclinkFile", $srclink_filename);

    return 1;
}
484
# Remove the tmp directory recorded in $self->{'tmp_dir'}, if there is one
# and it exists as a directory, and clear the record afterwards.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $work_dir = $self->{'tmp_dir'};
    if (defined($work_dir) && -d $work_dir) {
        ##print STDERR "**** Suppressing clean up of tmp dir\n";
        &util::rm_r($work_dir);
        $self->{'tmp_dir'} = undef;
    }
}
4971;
498
499
500
501
502
503
504
Note: See TracBrowser for help on using the repository browser.