source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 24290

Last change on this file since 24290 was 24290, checked in by sjm84, 13 years ago

Several changes to how Greenstone hashes PDF files and also added several more options to the EmbeddedMetadataPlugin

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29# PostScriptPlugin,
30# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31# to either HTML, Text or a series of images. It works by dynamically loading
32# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46use Config; # for getting the perlpath in the recommended way
47
48
# Set up inheritance at compile time: ConvertBinaryFile is a subclass of
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
52
# Values accepted by the 'convert_to' enum argument. The brace-delimited
# 'desc' strings are description keys rather than display text (presumably
# resolved elsewhere in Greenstone -- confirm).
my $convert_to_list = [
    { 'name' => "auto", 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html", 'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "text", 'desc' => "{ConvertBinaryFile.convert_to.text}" },
];

# Arguments this plugin contributes on top of those of its parent class.
my $arguments = [
    # target format of the conversion; "auto" is the default
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto",
    },
    # read by process_type() when naming the associated source file
    {
        'name' => "keep_original_filename",
        'desc' => "{ConvertBinaryFile.keep_original_filename}",
        'type' => "flag",
    },
    # description key is shared with HTMLPlugin
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        #'type' => "regexp",
        'deft' => "",
    },
    # read by read_into_doc_obj() to decide whether to run fribidi
    {
        'name' => "apply_fribidi",
        'desc' => "{ConvertBinaryFile.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no",
    },
    # adds "-use_strings" to the gsConvert.pl command line
    {
        'name' => "use_strings",
        'desc' => "{ConvertBinaryFile.use_strings}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Plugin description record; pushed onto the "OptList" by new().
my $options = {
    'name'     => "ConvertBinaryFile",
    'desc'     => "{ConvertBinaryFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
92
93
# Load and instantiate the secondary plugin(s) named in
# $self->{'convert_to_plugin'} (a comma separated list of plugin class names).
#
# Parameters:
#   $class, $hashArgOptLists - accepted for interface compatibility; not used
#   $input_args              - hash ref mapping a plugin class name to the
#                              argument list handed to that plugin's constructor
#
# For each class the package file is looked for first under
# $GSDLCOLLECTDIR/perllib/plugins (when GSDLCOLLECTDIR is set) and then under
# $GSDLHOME/perllib/plugins. Dies if the file is found in neither place, or
# if the plugin's constructor throws.
#
# The resulting objects are stored, keyed by class name, in
# $self->{'secondary_plugins'}.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # an unset 'convert_to_plugin' simply means there is nothing to load
    my $plugin_spec = $self->{'convert_to_plugin'};
    my @convert_to_list = (defined $plugin_spec) ? split(",", $plugin_spec) : ();
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the "convert_to" plugin package
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname;}
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # A direct class-method call replaces the former string eval of
        # "new $plugin_class(...)": no source text is built from the class
        # name, and the ambiguous indirect-object syntax is avoided. Errors
        # are still caught and re-thrown as strings, as before.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
137
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive this
#                      plugin's argument and option descriptions
#
# Returns the object built by AutoExtractMetadata, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect-object form
    # "new AutoExtractMetadata(...)" is parsed heuristically and is
    # particularly fragile inside a package that itself defines new().
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
150
# Derive the secondary plugin name ('convert_to_plugin') and the output file
# extension ('convert_to_ext') from $self->{'convert_to'}.
#
# Should be called by subclasses after checking and setting
# $self->{'convert_to'}. A value of "auto" is rewritten to "html". Values
# that match none of the known formats leave both settings untouched.
sub set_standard_convert_settings {
    my ($self) = @_;

    # "auto" is treated as a request for HTML
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    my $target = $self->{'convert_to'};

    if ($target =~ /^html/) {
        # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($target eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    }
    elsif ($target eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($target =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # pagedimg_jpg / pagedimg_gif / pagedimg_png select the image type;
        # anything else falls back to jpg
        my $image_ext = ($target =~ /pagedimg\_(jpg|gif|png)/i) ? $1 : 'jpg';
        $self->{'convert_to_ext'} = $image_ext;
    }
}
# Initialise this plugin via the parent class, then initialise every loaded
# secondary plugin with the same verbosity and output/fail handles.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugin_table = $self->{'secondary_plugins'};
    foreach my $name (keys %$plugin_table) {
        $plugin_table->{$name}->init($verbosity, $outhandle, $failhandle);
    }
}
192
# Called only once, after all plugin passes have been done: forwards the
# deinit() call to every loaded secondary plugin.
sub deinit {
    my $self = shift(@_);

    my $plugin_table = $self->{'secondary_plugins'};
    foreach my $name (keys %$plugin_table) {
        $plugin_table->{$name}->deinit();
    }
}
205
# Hook invoked by read_into_doc_obj() with the name of the freshly converted
# file. The default implementation does no post processing and returns
# nothing.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    # nothing to do by default
    return;
}
211
212
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a temporary directory so that we don't
# accidentally damage the input: the input file is soft-linked into that
# directory under a renamed, URL-encoded name and the link is what gets
# converted.
#
# Parameters:
#   $output_ext     - desired output type; currently unused, the type passed
#                     to gsConvert.pl is taken from $self->{'convert_to'}
#   $input_filename - file to convert
#   $textref        - currently unused
#
# Returns the name of the converted file, or "" if the converter reported
# "fail" (in which case the contents of the error log are copied to the
# plugin's output handle).
#
# Side effects: sets $self->{'tmp_dir'} when a timestamped tmp folder could be
# obtained, and sets $self->{'converted_to'} to "HTML", "Text" or "PagedImage"
# according to the output type the converter actually reported.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir; fall back to the input file's own
    # directory when no tmp folder is available
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    # NOTE(review): the command is run through the shell with the file names
    # only wrapped in double quotes; this relies on the renamed tmp file name
    # and the tmp directory not containing shell metacharacters.
    my $cmd = "\"$Config{perlpath}\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # what the converter prints is the output type it actually produced
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # 'num_not_processed' is deliberately not incremented here: doing so
        # meant that a failed conversion was counted twice.
        if (-s "$errlog") {
            # copy the converter's error log to our output; a log that
            # cannot be opened is reported rather than silently skipped
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            } else {
                print $outhandle "Could not open $errlog: $!\n";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # NOTE(review): this path is built from the original $tailname rather
        # than the renamed tmp name -- confirm this matches where the
        # converter writes its item file.
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # swap the (lower-cased) input suffix for the output type; the suffix
        # is quoted so that its "." and any other regex metacharacters are
        # matched literally
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
339
340
# Overrides the inherited read_into_doc_obj: the source file is first
# converted in the tmp area, the converted file is then handed to the
# secondary plugin to build the doc object, and finally the doc object is
# fixed up so that it describes the original source file.
#
# Returns:
#   -1             - conversion failed, the converted file does not exist, or
#                    the plugin specific process() step returned undef
#   0              - no secondary plugin is available (file effectively blocked)
#   $rv            - whatever the secondary plugin returned, when that was
#                    undefined or less than 1
#   (1, $doc_obj)  - on success
sub read_into_doc_obj {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $conv_filename = $self->tmp_area_convert_file($self->{'convert_to_ext'}, $filename_full_path);

    # had an error, will be passed down pipeline
    return -1 if ("$conv_filename" eq "");
    return -1 if (! -e "$conv_filename");

    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) == 0) {
            # fribidi succeeded: its output replaces the converted file
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
        else {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my @plugin_names = keys %$secondary_plugins;
    my $num_secondary_plugins = scalar(@plugin_names);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
        return 0; # effectively block it
    }

    # only a single secondary plugin is supported: take the first one
    my $plugin_name = $plugin_names[0];
    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
    }
    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level -- an empty hash is
    # passed to the secondary plugin in place of $metadata
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj($pluginfo, "", $conv_filename, $block_hash, {},
                                               $processor, $maxdocs, $total_count, $gli);

    # wasn't processed
    return $rv if ((!defined $rv) || ($rv < 1));

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used
    # in archives dbs for incremental build, so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $filename_encoding
        = $self->deduce_filename_encoding($file, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # record the source file name without its directory and suffix
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    my $process_result = $self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    if (!defined($process_result)) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $topsection, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
441
# Plugin specific processing of a doc object. This simply delegates to
# process_type(); $pluginfo, $metadata and $gli are accepted but not used.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
448
# Do plugin specific processing of doc_obj for the source document's type:
# associate the original file with the top section of the doc object and
# record the FileFormat, srcicon, srclink_file and srclinkFile metadata.
#
# The associated file and the source link are named "doc.<ext>", or after the
# doc object's own source file name when 'keep_original_filename' is set.
# Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty: a file name without an extension is not
    # handled specially and leaves $doc_ext as the empty string
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);
    my $file_type = (defined $self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    my $keep_original = ($self->{'keep_original_filename'} == 1);

    # when keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as SourceFile is used
    # in the srclink (below)
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";

    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original
        ? $doc_obj->get_sourcefile()
        : "doc.$doc_ext";

    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon".$doc_ext."_");
    # srclink_file is now deprecated because of the "_" in the metadataname.
    # Use srclinkFile; both are still written.
    foreach my $link_field ("srclink_file", "srclinkFile") {
        $doc_obj->add_utf8_metadata($cursection, $link_field, $srclink_filename);
    }
    return 1;
}
484
# Remove the temporary directory used for the conversion of the current
# document, if there is one.
#
# $self->{'tmp_dir'} is only set by tmp_area_convert_file() when a dedicated
# timestamped tmp folder was obtained, so the directory of the source file is
# never removed here. The entry is reset to undef afterwards so the same
# directory is not cleaned up twice.
sub clean_up_after_doc_obj_processing {
    my $self = shift(@_);

    my $tmp_dir = $self->{'tmp_dir'};
    if (defined $tmp_dir && -d $tmp_dir) {
        # The removal had been commented out in favour of a debugging
        # message, which left one tmp directory behind per converted document.
        &util::rm_r($tmp_dir);
        $self->{'tmp_dir'} = undef;
    }
}
4971;
498
499
500
501
502
503
504
Note: See TracBrowser for help on using the repository browser.