source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 23484

Last change on this file since 23484 was 23387, checked in by davidb, 13 years ago

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document lookup. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode-aware strings rather than utf8 but unaware strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.0 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29# PostScriptPlugin,
30# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31# to either HTML, Text or a series of images. It works by dynamically loading
32# an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug,
33# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46
# Set up inheritance at compile time: this package derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
50
# Values accepted by the 'convert_to' option.  Each 'desc' is a lookup key
# written in braces rather than literal text.
my $convert_to_list = [
    { name => "auto", desc => "{ConvertBinaryFile.convert_to.auto}" },
    { name => "html", desc => "{ConvertBinaryFile.convert_to.html}" },
    { name => "text", desc => "{ConvertBinaryFile.convert_to.text}" },
];

# Option descriptions contributed by this plugin; new() appends them to the
# shared "ArgList".
my $arguments = [
    {   name => "convert_to",
        desc => "{ConvertBinaryFile.convert_to}",
        type => "enum",
        reqd => "yes",
        list => $convert_to_list,
        deft => "auto",
    },
    {   name => "keep_original_filename",
        desc => "{ConvertBinaryFile.keep_original_filename}",
        type => "flag",
    },
    {   name => "title_sub",
        desc => "{HTMLPlugin.title_sub}",
        type => "string",    # a "regexp" type was tried here and left disabled
        deft => "",
    },
    {   name => "apply_fribidi",
        desc => "{ConvertBinaryFile.apply_fribidi}",
        type => "flag",
        reqd => "no",
    },
    {   name => "use_strings",
        desc => "{ConvertBinaryFile.use_strings}",
        type => "flag",
        reqd => "no",
    },
];

# Plugin-level description record; new() appends it to the shared "OptList".
my $options = {
    name     => "ConvertBinaryFile",
    desc     => "{ConvertBinaryFile.desc}",
    abstract => "yes",
    inherits => "yes",
    args     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) that will handle the converted
# output.
#
# $self->{'convert_to_plugin'} is read as a comma-separated list of plugin
# class names.  For each name, "<name>.pm" is looked for first under
# $ENV{'GSDLCOLLECTDIR'}/perllib/plugins (when that variable is defined) and
# then under $ENV{'GSDLHOME'}/perllib/plugins; the first file found is
# require'd.  The class is then constructed with an empty plugin list and the
# per-class entry of $input_args.
#
# Parameters:
#   $class           - not used by this routine
#   $input_args      - hash ref keyed by plugin class name; each value is
#                      handed to that class's constructor
#   $hashArgOptLists - not used by this routine
#
# Side effects: stores a hash ref { class name => plugin object } in
# $self->{'secondary_plugins'}.
# Dies if a plugin file cannot be found or if its constructor fails.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    my $secondary_plugins = {};

    foreach my $plugin_class (split(",", $self->{'convert_to_plugin'})) {
        # locate the "convert_to" plugin package on disk
        my $plugin_package = $plugin_class . ".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib", "plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib", "plugins",
                                               $plugin_package);

        # a collection-specific copy takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Use a direct class-method call inside a block eval rather than a
        # string eval with indirect-object syntax: the class name is never
        # compiled as code, and parsing cannot be confused by this package's
        # own "new" subroutine.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Appends this class name to $pluginlist, adds this plugin's argument and
# option descriptions to the "ArgList" / "OptList" entries of
# $hashArgOptLists, builds the object through the AutoExtractMetadata
# constructor and re-blesses the result into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
148
# Derive the secondary plugin name and output file extension from
# $self->{'convert_to'}.  Subclasses should call this once they have checked
# and set $self->{'convert_to'}.
#
#   auto            -> treated as (and rewritten to) "html"
#   html*           -> HTMLPlugin,           ext "html"
#   text            -> TextPlugin,           ext "txt"
#   structuredhtml  -> StructuredHTMLPlugin, ext "html"
#   pagedimg*       -> PagedImagePlugin,     ext taken from a
#                      "pagedimg_jpg|gif|png" suffix, otherwise "jpg"
#
# Any other value leaves 'convert_to_plugin' and 'convert_to_ext' untouched.
sub set_standard_convert_settings {
    my ($self) = @_;

    # "auto" is just an alias for HTML output
    $self->{'convert_to'} = "html" if $self->{'convert_to'} eq "auto";
    my $target = $self->{'convert_to'};

    if ($target =~ /^html/) {    # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'}    = "html";
    }
    elsif ($target eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'}    = "txt";
    }
    elsif ($target eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'}    = "html";
    }
    elsif ($target =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # the image type is matched case-insensitively and stored as written
        my ($image_ext) = $target =~ /pagedimg\_(jpg|gif|png)/i;
        $self->{'convert_to_ext'} = defined $image_ext ? $image_ext : 'jpg';
    }
}
# Initialise this plugin through the parent class, then forward the same
# verbosity and output/fail handles to every loaded secondary plugin.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugins = $self->{'secondary_plugins'};
    for my $plugin (values %$plugins) {
        $plugin->init($verbosity, $outhandle, $failhandle);
    }
}
190
# Called only once, after all plugin passes have been done: forwards the
# deinit call to every loaded secondary plugin.
sub deinit {
    my $self = shift;

    my $plugins = $self->{'secondary_plugins'};
    $_->deinit() for values %$plugins;
}
203
# Hook invoked by read_into_doc_obj with the converted file's name.  The base
# implementation performs no post-processing and returns nothing.
sub convert_post_process {
    return;
}
209
210
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The input file is soft-linked into a timestamped tmp directory (falling
# back to the input file's own directory when no tmp directory is available)
# under a URL-encoded/renamed name, and the conversion is run on that link so
# that we don't accidentally damage the input.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; the output type
#                     actually requested from gsConvert.pl is
#                     $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - not used by this routine
#
# Returns the name of the converted file, or "" when gsConvert.pl reports
# "fail" (in which case the contents of the error log are copied to the
# plugin's output handle).
#
# Side effects: sets $self->{'tmp_dir'} (when a tmp folder was obtained) and
# $self->{'converted_to'} ("HTML", "Text" or "PagedImage") according to the
# output type gsConvert.pl printed.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle  = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    }
    else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix    = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    # NOTE(review): the command is run through the shell with only double
    # quotes around the file names; this relies on the tmp file name having
    # been made shell-safe by the renaming above -- confirm.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";

    # gsConvert.pl's standard output is taken as the type it actually produced
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # three-argument open on a lexical handle, with the result
            # checked, so an unreadable log is reported instead of ignored
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open error log $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    }
    elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # NOTE(review): this path is built from the raw $tailname, not the
        # renamed $utf8_tailname used for the tmp file -- confirm that this
        # matches where gsConvert.pl writes the item file.
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        }
        else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    }
    else {
        # \Q..\E: the suffix is literal text, not a pattern (it starts with
        # "." and may contain other regex metacharacters such as "+" or "(")
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
337
338
# Override of the inherited read_into_doc_obj - we need to call secondary
# plugin stuff.
#
# Converts the input file with tmp_area_convert_file, optionally runs
# "fribidi" over the result, hands the converted file to a secondary plugin's
# read_into_doc_obj, and then re-labels the resulting document object so that
# it refers to the original source file.
#
# Returns:
#   -1             - conversion failed, the converted file does not exist, or
#                    process() returned undef
#   0              - no secondary plugin is loaded (file is effectively blocked)
#   $rv            - whatever the secondary plugin returned, when that is
#                    undefined or less than 1
#   (1, $doc_obj)  - on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            # replace the converted file with fribidi's output
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    # sorted so that the plugin chosen below does not depend on hash ordering
    my @plugin_names = sort keys %$secondary_plugins;
    my $num_secondary_plugins = scalar(@plugin_names);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level; an empty hash is
    # passed in place of $metadata, and "" in place of the base directory
    # because $conv_filename is passed as the file argument
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv < 1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # record the source file's name without directory or extension
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $topsection, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
439
# Plugin-specific processing of a document object: delegates to process_type,
# passing on only the base directory, file name and document object.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
446
# Do plugin specific processing of doc_obj for the doc_ext type.
#
# Associates the original file with the top section of $doc_obj and records
# the "FileFormat", "srcicon" and "srclink_file" metadata.  The associated
# file / source link name is "doc.<ext>" (extension lower-cased), unless
# $self->{'keep_original_filename'} is 1, in which case the names are taken
# from the document object.  Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);

    my $default_name  = "doc.$doc_ext";
    my $keep_original = ($self->{'keep_original_filename'} == 1);
    my $file_type     = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename   = &util::filename_cat($base_dir, $file);

    # when keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as SourceFile is used
    # in the srclink (below)
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : $default_name;
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original
        ? $doc_obj->get_sourcefile()
        : $default_name;
    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon" . $doc_ext . "_");
    $doc_obj->add_utf8_metadata($cursection, "srclink_file", $srclink_filename);

    return 1;
}
480
# Remove the tmp directory recorded in $self->{'tmp_dir'} (if it is set and
# is an existing directory) and reset the entry to undef.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $tmp_dir = $self->{'tmp_dir'};
    return unless defined $tmp_dir && -d $tmp_dir;

    &util::rm_r($tmp_dir);
    $self->{'tmp_dir'} = undef;
}
4931;
494
495
496
497
498
499
500
Note: See TracBrowser for help on using the repository browser.