source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago

Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29# PostScriptPlugin,
30# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31# to either HTML, Text or a series of images. It works by dynamically loading
32# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46
# Establish inheritance at compile time: ConvertBinaryFile is a subclass of
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
50
# Values offered for the 'convert_to' enum argument. Each 'desc' is a
# lookup key of the form {Plugin.key}, not literal display text.
my $convert_to_list = [
    { 'name' => 'auto', 'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html', 'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text', 'desc' => '{ConvertBinaryFile.convert_to.text}' },
];

# Arguments contributed by this plugin; new() appends them to the shared
# "ArgList".
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; new() appends it to the shared "OptList".
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) named in
# $self->{'convert_to_plugin'} (a comma-separated list of plugin class names).
#
# For each class name, the module file "<Class>.pm" is looked for first under
# $ENV{'GSDLCOLLECTDIR'}/perllib/plugins (when that variable is defined) and
# then under $ENV{'GSDLHOME'}/perllib/plugins. The constructed objects are
# stored, keyed by class name, in $self->{'secondary_plugins'}.
#
# Parameters:
#   $class           - not used by this method
#   $input_args      - hash ref; $input_args->{<Class>} is passed to that
#                      plugin's constructor as its argument list
#   $hashArgOptLists - not used by this method
#
# Dies if a plugin module cannot be found or if its constructor throws.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    my @convert_to_list = split(",",$self->{'convert_to_plugin'});
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the "convert_to" plugin package on disk
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct method call on the class name: no code is built from the
        # class-name string, so nothing in it can be misparsed as Perl.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option definitions
#
# Returns the object built by AutoExtractMetadata, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit method call: the indirect-object form "new Class(...)" is
    # ambiguous inside a package that defines its own sub new.
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
148
# Derive $self->{'convert_to_plugin'} and $self->{'convert_to_ext'} from
# $self->{'convert_to'}. Should be called by subclasses after they have
# checked and set $self->{'convert_to'}.
#
# "auto" is resolved to "html" (and written back to $self->{'convert_to'}).
# A value that matches none of the known kinds leaves both derived settings
# untouched.
sub set_standard_convert_settings {
    my ($self) = @_;

    $self->{'convert_to'} = "html" if $self->{'convert_to'} eq "auto";
    my $target = $self->{'convert_to'};

    my ($plugin_name, $extension);
    if ($target =~ /^html/) {
        # may be html or html_multi
        ($plugin_name, $extension) = ("HTMLPlugin", "html");
    }
    elsif ($target eq "text") {
        ($plugin_name, $extension) = ("TextPlugin", "txt");
    }
    elsif ($target eq "structuredhtml") {
        ($plugin_name, $extension) = ("StructuredHTMLPlugin", "html");
    }
    elsif ($target =~ /^pagedimg/) {
        $plugin_name = "PagedImagePlugin";
        # the image type may be given as a suffix; jpg when absent
        ($extension) = $target =~ /pagedimg\_(jpg|gif|png)/i;
        $extension = 'jpg' unless defined $extension;
    }
    else {
        return;
    }

    $self->{'convert_to_plugin'} = $plugin_name;
    $self->{'convert_to_ext'} = $extension;
    return;
}
# Initialise this plugin through the parent class, then pass the same
# verbosity level and output/fail handles to every loaded secondary plugin.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity,$outhandle,$failhandle);

    my $loaded_plugins = $self->{'secondary_plugins'};
    for my $plugin (values %$loaded_plugins) {
        $plugin->init($verbosity,$outhandle,$failhandle);
    }
}
190
# Called only once, after all plugin passes have been done.
# Forwards the call to every loaded secondary plugin.
sub deinit {
    my $self = shift (@_);

    my $loaded_plugins = $self->{'secondary_plugins'};
    for my $plugin (values %$loaded_plugins) {
        $plugin->deinit();
    }
}
203
# Hook invoked by read_into_doc_obj with the converted file's name once the
# conversion has produced a file. The default implementation does no post
# processing.
sub convert_post_process { return; }
209
210
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a temporary directory so that we don't
# accidentally damage the input: the input file is soft-linked into the
# folder returned by util::get_timestamped_tmp_folder (or, when that returns
# undef, into the input file's own directory) and the converter is run on
# the link.
#
# Parameters:
#   $output_ext     - not consulted here; the requested output type is taken
#                     from $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - not used
#
# Returns the name of the converted file, or "" when the converter reports
# "fail" (in which case the contents of the error log are copied to the
# output handle).
#
# Side effects: sets $self->{'tmp_dir'} when a tmp folder was obtained, and
# sets $self->{'converted_to'} to "HTML", "Text" or "PagedImage" according to
# the output type the converter reports.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type=$self->{'convert_to'};

    # NOTE(review): the command is run through the shell with the file names
    # only wrapped in double quotes; a name containing a double quote or
    # other shell metacharacters would not be handled safely.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # the converter prints the output type it actually produced (or "fail")
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # num_not_processed is deliberately not incremented here: doing so
        # meant a failed conversion was counted twice.
        if (-s "$errlog") {
            # copy the converter's error log to the output handle
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # item output lives in a sub-directory named after the file
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # Swap the (lower-cased) input suffix for the output type. The suffix
        # is quoted so that its leading "." and any other regex
        # metacharacters are matched literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
331
332
# Override of the inherited read_into_doc_obj: the source file is first
# converted (see tmp_area_convert_file) and the converted file is then handed
# to a secondary plugin, which builds the document object.
#
# Returns:
#   -1              if the conversion failed, the converted file does not
#                   exist, or process() returned undef
#   0               if no secondary plugin is loaded (the file is skipped)
#   $rv             whatever the secondary plugin returned, when that is
#                   undefined or less than 1
#   (1, $doc_obj)   on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            # replace the converted file with fribidi's output
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet!  Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level -- an empty hash is
    # passed to the secondary plugin in place of $metadata
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # FilenameRoot is the source file's name without directory or suffix
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
433
# Plugin-specific processing of a document object. Delegates to
# process_type, passing on only the base directory, the file and the
# document object, and returns whatever process_type returns.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);
    return $result;
}
440
# Plugin-specific processing of doc_obj for the file's type.
#
# Associates the original file with the document's top section and records
# the FileFormat, srcicon and srclink_file metadata there. The associated
# file is called "doc.<ext>" unless the keep_original_filename option is set,
# in which case the names are obtained from the document object.
#
# Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my ($extension) = $file =~ /\.(\w+)$/;
    my $file_type = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);
    my $assoc_name = "doc.$extension";
    # this should be the same filename that was used for the Source and SourceFile metadata,
    # as we will use SourceFile in the srclink (below)
    $assoc_name = $doc_obj->get_assocfile_from_sourcefile()
        if $self->{'keep_original_filename'} == 1;
    $doc_obj->associate_file($source_path, $assoc_name, undef, $section);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($section, "FileFormat", $file_type);

    my $link_name = "doc.$extension";
    $link_name = $doc_obj->get_sourcefile()
        if $self->{'keep_original_filename'} == 1;
    $doc_obj->add_utf8_metadata ($section, "srcicon", "_icon".$extension."_");
    $doc_obj->add_utf8_metadata ($section, "srclink_file", $link_name);

    return 1;
}
472
# Remove the temporary conversion directory recorded in $self->{'tmp_dir'}
# (if it is set and exists as a directory), then clear that setting.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $conversion_dir = $self->{'tmp_dir'};
    return unless defined $conversion_dir;
    return unless -d $conversion_dir;

    &util::rm_r($conversion_dir);
    $self->{'tmp_dir'} = undef;
    return;
}
4841;
485
486
487
488
489
490
491
Note: See TracBrowser for help on using the repository browser.