source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 28489

Last change on this file since 28489 was 28381, checked in by ak19, 11 years ago

Bugfix. When dealing with filenames with special characters that are converted to URL encoding, on Windows ConvertBinaryFile wasn't looking for the new filename but the original one. It needs to be looking for the new filename. Problem noticed with a ppt file sent in to the mailing list.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.4 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
# PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion
# of these document types to either HTML, text or a series of images. It
# works by dynamically loading an appropriate secondary plugin (HTMLPlugin,
# StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin
# argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46use util;
47use FileUtils;
48
49
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
53
# Values accepted by the 'convert_to' option.  Each 'desc' is a braced
# description key, reproduced verbatim.
my $convert_to_list = [
    {
        'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}",
    },
    {
        'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}",
    },
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}",
    },
];

# Arguments this plugin contributes to the argument list (see new()).
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto",
    },
    {
        'name' => "keep_original_filename",
        'desc' => "{ConvertBinaryFile.keep_original_filename}",
        'type' => "flag",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        #'type' => "regexp",
        'deft' => "",
    },
    {
        'name' => "apply_fribidi",
        'desc' => "{ConvertBinaryFile.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "use_strings",
        'desc' => "{ConvertBinaryFile.use_strings}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Option record for this (abstract) plugin; pushed onto the "OptList"
# by new().
my $options = {
    'name'     => "ConvertBinaryFile",
    'desc'     => "{ConvertBinaryFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
93
94
# Load and instantiate each secondary plugin named in
# $self->{'convert_to_plugin'} (a comma-separated list of class names).
#
# For every class name the matching "<Class>.pm" is looked for first under
# $GSDLCOLLECTDIR/perllib/plugins (when GSDLCOLLECTDIR is set) and then
# under $GSDLHOME/perllib/plugins; the first one that exists is require'd.
# The class is then constructed with an empty plugin list and the
# per-class argument list taken from $input_args->{<Class>}.
#
# Parameters:
#   $class           - not used here
#   $input_args      - hash ref mapping plugin class name to its arguments
#   $hashArgOptLists - not used here
#
# Stores the resulting { class name => plugin object } hash ref in
# $self->{'secondary_plugins'}.
#
# Dies if a plugin file cannot be found or its constructor throws.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    my @convert_to_list = split(",", $self->{'convert_to_plugin'});
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # work out where the "convert_to" plugin package lives
        my $plugin_package = $plugin_class . ".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},
                                                           "perllib", "plugins",
                                                           $plugin_package);
        }

        my $mainplugname = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},
                                                           "perllib", "plugins",
                                                           $plugin_package);

        # a collection-specific copy takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct class-method call inside a block eval: no code string is
        # built from $plugin_class and there is no indirect-object parsing
        # ambiguity.  Constructor failures are re-thrown as before.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
138
# Constructor.  Appends this class to the plugin list, contributes this
# plugin's arguments and options to the shared "ArgList"/"OptList", builds
# the object through AutoExtractMetadata and re-blesses it into $class.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names (this class is pushed)
#   $inputargs       - passed through to the parent constructor
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @$arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
151
# Derive the secondary plugin and output extension from the requested
# conversion target.  Subclasses should call this after checking and
# setting $self->{'convert_to'}.
#
# Sets $self->{'convert_to_plugin'} and $self->{'convert_to_ext'}:
#   auto           -> rewritten to "html", then treated as html
#   html*          -> HTMLPlugin / html   (covers html and html_multi)
#   text           -> TextPlugin / txt
#   structuredhtml -> StructuredHTMLPlugin / html
#   pagedimg*      -> PagedImagePlugin / jpg, gif or png (jpg by default)
# Any other value leaves both settings untouched.
sub set_standard_convert_settings {
    my ($self) = @_;

    # "auto" simply means HTML
    $self->{'convert_to'} = "html" if ($self->{'convert_to'} eq "auto");

    my $target = $self->{'convert_to'};

    if ($target eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    }
    elsif ($target eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($target =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # image type comes from the "_jpg"/"_gif"/"_png" tail, if present
        my $image_ext = ($target =~ /pagedimg\_(jpg|gif|png)/i) ? $1 : 'jpg';
        $self->{'convert_to_ext'} = $image_ext;
    }
    elsif ($target =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
}
# Initialise this plugin via the parent class, then pass the same
# verbosity and handles on to every loaded secondary plugin.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugins_by_name = $self->{'secondary_plugins'};

    foreach my $name (keys %$plugins_by_name) {
        $plugins_by_name->{$name}->init($verbosity, $outhandle, $failhandle);
    }
}
192
# Called only once, after all plugin passes have been done: forwards
# deinit() to every loaded secondary plugin.
sub deinit {
    my $self = shift(@_);

    my $plugins_by_name = $self->{'secondary_plugins'};

    foreach my $name (keys %$plugins_by_name) {
        $plugins_by_name->{$name}->deinit();
    }
}
205
# Hook called by read_into_doc_obj() with the converted file's name once
# conversion has succeeded.  This default implementation performs no
# post processing and returns nothing.
sub convert_post_process {
    return;
}
211
212
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a temporary directory, on a soft link to
# the input, so that we don't accidentally damage the input.
#
# Parameters:
#   $output_ext     - desired output type as passed by the caller; not
#                     used in this implementation (the requested type is
#                     taken from $self->{'convert_to'})
#   $input_filename - path of the file to convert
#   $textref        - not used here
#
# Returns the name of the converted file, or "" if gsConvert.pl reported
# "fail" (the failure and any err.log contents are then written to the
# output/fail handles).
#
# Side effects: sets $self->{'tmp_dir'} when a timestamped tmp folder was
# obtained, and $self->{'converted_to'} ("HTML", "Text" or "PagedImage")
# according to the type gsConvert.pl actually produced.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # work in a timestamped tmp folder when one is available, otherwise
    # fall back to the directory of the input file
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    # NOTE(review): the command is a shell string built by interpolation;
    # file names are only protected by double quotes.
    my $cmd = "\"".&util::get_perl_exec()."\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";

    # gsConvert.pl prints the type it actually produced (or "fail")
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # copy the converter's error log to the output handle; a log
            # that cannot be opened is reported rather than read blindly
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle "$line";
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &FileUtils::removeFiles("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # paged-image output lives in a sub-directory named after the
        # (renamed) tail name
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type";
        }
    } else {
        # swap the original suffix for the produced type; the suffix is
        # quoted so that "." and any other regex metacharacters in it are
        # matched literally
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
339
340
# Override of the inherited read_into_doc_obj - we need to call the
# secondary plugin on the converted file.
#
# Steps: convert the file via tmp_area_convert_file(), run the
# convert_post_process() hook, optionally run fribidi over HTML/Text
# output, let the (first) secondary plugin build the doc object from the
# converted file, then fix up source-related metadata so that it refers
# to the original file rather than the converted one.
#
# Returns:
#   -1               if conversion failed, the converted file is missing,
#                    or process() returned undef
#   0                if no secondary plugin is loaded
#   $rv              (undefined or < 1) as returned by the secondary
#                    plugin when it did not process the file
#   (1, $doc_obj)    on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_tmp = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirection may have left a partial/empty file
            &FileUtils::removeFiles($fribidi_tmp) if (-e $fribidi_tmp);
        }
        else {
            &FileUtils::moveFiles($fribidi_tmp, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    ## **** I just replaced $metadata with {} in following
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # ****
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $topsection, $filename_no_path);

    # force a new OID - this will use OIDtype option set for this plugin.
    $self->add_OID($doc_obj, 1);

    return (1, $doc_obj);
}
442
# Plugin-specific processing of a doc object: delegates to process_type()
# with the base directory, file and doc object.  The remaining arguments
# ($pluginfo, $metadata, $gli) are accepted but not used here.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
449
# Plugin-specific processing of doc_obj for the document's file type.
#
# Associates the original file with the top section of the doc object
# and records FileFormat, srcicon, srclink_file and srclinkFile metadata.
# The associated/linked name is "doc.<ext>" (lower-cased extension of
# $file) unless the 'keep_original_filename' option equals 1, in which
# case the names reported by the doc object are used instead.
#
# Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my $doc_ext;
    ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);

    my $file_type = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
    my $keep_original = ($self->{'keep_original_filename'} == 1);
    my $generic_name = "doc.$doc_ext";

    # when keeping the original name, this should be the same filename
    # that was used for the Source and SourceFile metadata, as SourceFile
    # is used in the srclink (below)
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : $generic_name;

    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original
        ? $doc_obj->get_sourcefile()
        : $generic_name;

    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata($cursection, "srclink_file", $srclink_filename);
    $doc_obj->add_utf8_metadata($cursection, "srclinkFile", $srclink_filename);

    return 1;
}
485
# Remove the temporary conversion directory recorded in
# $self->{'tmp_dir'} (if it is set and still exists) and forget it.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $conversion_dir = $self->{'tmp_dir'};

    if (defined $conversion_dir && -d $conversion_dir) {
        ##print STDERR "**** Suppressing clean up of tmp dir\n";
        &FileUtils::removeFilesRecursive($conversion_dir);
        $self->{'tmp_dir'} = undef;
    }
}
4981;
499
500
501
502
503
504
505
Note: See TracBrowser for help on using the repository browser.