source: gsdl/trunk/perllib/plugins/ConvertBinaryFile.pm@ 18320

Last change on this file since 18320 was 18320, checked in by ak19, 12 years ago

Plugins now offer a choice between base64-encoding and URL-encoding the names of files that are renamed when they are copied into the archives directory. Previously, renamed files were always URL-encoded. URL-encoding remains the default for most plugins; the exceptions are MP3Plugin and OggVorbisPlugin, whose default is base64 encoding. Base64 encoding of renamed filenames was introduced so that more of the files that browsers hand off to external applications can actually be opened by those applications, since URL-encoding does not seem to be implemented the same way everywhere (for instance, Windows Media Player is unable to handle URL-encoded wmv filenames when such files are launched in it through the browser).

  • Property svn:keywords set to Author Date Id Revision
File size: 17.5 KB
Line 
1###########################################################################
2#
3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29# PostScriptPlugin,
30# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31# to either HTML, Text or a series of images. It works by dynamically loading
32# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35package ConvertBinaryFile;
36
37use AutoExtractMetadata;
38use ghtml;
39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
45no strict 'subs';
46
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
50
# Allowed values of the 'convert_to' argument. Each entry pairs the value
# name with the key of its description string.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto html text)
];
59
# Argument table for this plugin; 'new' appends it to the shared "ArgList".
my $arguments = [
    # Target format of the conversion (one of the $convert_to_list values).
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto",
    },
    # When set, process_type names the associated file after the source
    # file instead of "doc.<ext>".
    {
        'name' => "keep_original_filename",
        'desc' => "{ConvertBinaryFile.keep_original_filename}",
        'type' => "flag",
    },
    # Declared as a plain string ('regexp' type was considered and left
    # disabled); the description is shared with HTMLPlugin.
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    # When set, read_into_doc_obj runs "fribidi" over HTML/Text output.
    {
        'name' => "apply_fribidi",
        'desc' => "{ConvertBinaryFile.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no",
    },
    # When set, "-use_strings" is passed to gsConvert.pl.
    {
        'name' => "use_strings",
        'desc' => "{ConvertBinaryFile.use_strings}",
        'type' => "flag",
        'reqd' => "no",
    },
];
84
# Option record for this plugin; 'new' appends it to the shared "OptList".
my $options = {
    'name'     => "ConvertBinaryFile",
    'desc'     => "{ConvertBinaryFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) named by $self->{'convert_to'}.
#
# $self->{'convert_to'} is treated as a comma-separated list; for each entry
# X the package "XPlugin" is loaded from <X>Plugin.pm, looked up first under
# $GSDLCOLLECTDIR/perllib/plugins (when GSDLCOLLECTDIR is set) and then under
# $GSDLHOME/perllib/plugins. Each plugin is constructed with an empty plugin
# list and the argument list found in $input_args->{"XPlugin"}.
#
# Parameters:
#   $class, $hashArgOptLists - accepted for the caller's convenience; not
#                              used by this method.
#   $input_args              - hash ref mapping plugin class name to the
#                              argument list for that plugin's constructor.
#
# Side effects: stores a hash ref (plugin class name => plugin object) in
# $self->{'secondary_plugins'}.
#
# Dies if a plugin file cannot be found, or if loading or constructing a
# plugin fails.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # Treat a missing 'convert_to' as "no secondary plugins" rather than
    # splitting an undefined value.
    my $convert_to_spec = defined $self->{'convert_to'} ? $self->{'convert_to'} : "";
    my @convert_to_list = split(",", $convert_to_spec);
    my $secondary_plugins = {};

    foreach my $convert_to (@convert_to_list) {
        # load in "convert_to" plugin package
        my $plugin_class = $convert_to."Plugin";
        my $plugin_package = $plugin_class.".pm";

        # A collection-specific copy of the plugin takes precedence.
        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            # NOTE(review): gsprintf is called unqualified, but no import of
            # it is visible in this file - confirm it is in scope here.
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct class-method call instead of a string eval: no code is
        # built from the class name, and any constructor failure propagates
        # to the caller unchanged.
        my $secondary_plugin = $plugin_class->new([], $arglist);
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Registers this class and its argument/option tables, builds the object via
# the AutoExtractMetadata constructor, then resolves the requested
# 'convert_to' value into:
#   $self->{'convert_to'}     - "HTML", "Text", "StructuredHTML" or "PagedImage"
#   $self->{'convert_to_ext'} - the matching file extension
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; $class is appended.
#   $inputargs       - passed through to the parent constructor.
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays are extended
#                      with this plugin's tables.
#
# Returns the object blessed into $class.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    # The first entry of the plugin list decides which plugin-specific
    # rules apply below.
    my $classPluginName = $class;
    $classPluginName = $pluginlist->[0] if (defined $pluginlist->[0]);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    # don't worry about any options etc when only plugin info is wanted
    return bless($self, $class) if ($self->{'info_only'});

    my $requested = $self->{'convert_to'};
    $requested = "auto" if (!defined $requested || $requested eq "");

    my $windows_scripting = defined $self->{'windows_scripting'}
        ? $self->{'windows_scripting'}
        : 0;

    # Evaluated lazily so the environment is consulted only in the branches
    # that need it.
    my $on_windows = sub { return $ENV{'GSDLOS'} =~ /^windows$/i; };

    # Plugin-specific adjustments of the requested type.
    if ($classPluginName eq "PDFPlugin") {
        if ($requested eq "text" && $on_windows->()) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $requested = "html";
        }
    }
    elsif ($classPluginName eq "WordPlugin") {
        # we use structured HTML, not normal html
        $requested = "structuredhtml"
            if ($windows_scripting && $on_windows->() && $requested =~ /^(html|auto)$/);
    }
    elsif ($classPluginName eq "PowerPointPlugin") {
        # we use paged img
        $requested = "pagedimg_jpg"
            if ($windows_scripting && $on_windows->() && $requested eq "auto");
    }
    elsif ($classPluginName eq "PostScriptPlugin") {
        # we use text
        $requested = "text" if ($requested eq "auto");
    }

    # choose html for now - should choose a format based on doc type
    $requested = "html" if ($requested eq "auto");

    # Requested type => [ secondary plugin prefix, output extension ].
    my %target_for = (
        "html"           => [ "HTML",           "html" ],
        "text"           => [ "Text",           "txt"  ],
        "structuredhtml" => [ "StructuredHTML", "html" ],
    );

    if (exists $target_for{$requested}) {
        my ($plugin_prefix, $extension) = @{ $target_for{$requested} };
        $self->{'convert_to'} = $plugin_prefix;
        $self->{'convert_to_ext'} = $extension;
    }
    elsif ($requested =~ /^pagedimg/) {
        $self->{'convert_to'} = "PagedImage";
        # The image format is taken from the type name; jpg if none given.
        my ($image_ext) = $requested =~ /pagedimg\_(jpg|gif|png)/i;
        $self->{'convert_to_ext'} = defined $image_ext ? $image_ext : 'jpg';
    }

    return bless($self, $class);
}
203
204
# Initialise this plugin through the parent class, then pass the same
# verbosity and handles to every secondary plugin held in
# $self->{'secondary_plugins'}.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity,$outhandle,$failhandle);

    my $secondary_plugins = $self->{'secondary_plugins'};
    my @plugin_names = keys %$secondary_plugins;

    foreach my $name (@plugin_names) {
        $secondary_plugins->{$name}->init($verbosity,$outhandle,$failhandle);
    }
}
218
# Called only once, after all plugin passes have been done: forwards the
# deinit call to every secondary plugin held in $self->{'secondary_plugins'}.
sub deinit {
    my $self = $_[0];

    my $secondary_plugins = $self->{'secondary_plugins'};
    my @plugin_names = keys %$secondary_plugins;

    foreach my $name (@plugin_names) {
        $secondary_plugins->{$name}->deinit();
    }
}
231
# Hook called by read_into_doc_obj with the converted filename. The default
# implementation does no post processing and returns nothing.
sub convert_post_process { return; }
237
238
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a 'tmp' directory (under GSDLCOLLECTDIR if
# set, else GSDLHOME, else next to the input file) so that we don't
# accidentally damage the input. The input is soft-linked into that
# directory under a renamed filename, converted, and the link removed again.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not used here.
#                     The requested output type is derived from
#                     $self->{'convert_to'} and $self->{'convert_to_ext'}.
#   $input_filename - path of the file to convert.
#   $textref        - accepted for interface compatibility; not used here.
#
# Returns the filename of the converted output, or "" if the converter
# reported "fail". On success $self->{'converted_to'} is set to "HTML",
# "Text" or "PagedImage" according to the type the converter reported.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = $dirname;
    if (defined $ENV{'GSDLCOLLECTDIR'}) {
        $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
    } elsif (defined $ENV{'GSDLHOME'}) {
        $tmp_dirname = $ENV{'GSDLHOME'};
    }
    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # Whitespace, dots and hyphens are deliberately NOT stripped from the
    # filename: it is not necessary and causes problems with
    # replacing_srcdoc_with_html in the GSDLremote case.

    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $tailname = &util::rename_file($tailname, $self->{'file_rename_method'});

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type="";
    if ($convert_to =~ m/PagedImage/i) {
        $output_type = lc($convert_to)."_".lc($convert_to_ext);
    } else {
        $output_type = lc($convert_to);
    }

    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the command goes through the shell; the quoting below
    # relies on the paths containing no double quotes or shell
    # metacharacters.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    # The converter prints the type it actually produced (or "fail").
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # num_not_processed is deliberately not incremented here: doing so
        # meant that a failed document was counted twice.
        if (-s "$errlog") {
            # Copy the converter's error log to the output handle, using a
            # lexical handle and a checked three-argument open.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            } else {
                print $outhandle "Could not open $errlog for reading: $!\n";
            }
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # Paged image output lives in a sub-directory named after the file.
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # Swap the original suffix for the output type. The suffix is quoted
        # so that its leading dot (and any other regex metacharacter in it)
        # is matched literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
369
370
# Override the inherited read_into_doc_obj - we need to convert the file
# first and then hand the converted file to a secondary plugin.
#
# Steps: convert the source file in the tmp area, run the
# convert_post_process hook, optionally run "fribidi" over HTML/Text output,
# let the (single) secondary plugin build the document object from the
# converted file, then attach source-file metadata and associated files to
# that document object.
#
# Returns:
#   -1             - conversion failed, the converted file does not exist, or
#                    plugin specific processing failed
#   0              - no secondary plugin is loaded (the file is skipped)
#   $rv            - the secondary plugin's own result when it is undefined
#                    or less than 1
#   (1, $doc_obj)  - success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_output = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # Don't leave a partial output file behind in the tmp area; the
            # converted file itself is left untouched.
            &util::rm($fribidi_output) if (-e $fribidi_output);
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level - an empty hash is
    # passed in place of $metadata
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    $doc_obj->set_converted_filename($collect_conv_file);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);

}
461
# Plugin specific processing of the document object: delegates to
# process_type with the base directory, file and document object, and
# returns its result. The remaining arguments are not used here.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);
    return $result;
}
468
# Do plugin specific processing of doc_obj for the plugin's document type.
#
# Associates the original source file with the top section of the document
# object and records "FileFormat", "srclink", "srcicon" and "/srclink"
# metadata pointing at it. The associated file is named "doc.<ext>", where
# <ext> is $self->{'filename_extension'}, unless the keep_original_filename
# option is set, in which case the name is derived from the source file.
#
# Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my $doc_ext = $self->{'filename_extension'};
    my $file_type = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);

    # With keep_original_filename this should be the same filename that was
    # used for the Source and SourceFile metadata, as [SourceFile] is used
    # in the srclink.
    my $assoc_name = ($self->{'keep_original_filename'} == 1)
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";
    $doc_obj->associate_file($source_path, $assoc_name, undef, $top_section);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($top_section, "FileFormat", $file_type);

    my $open_link = ($self->{'keep_original_filename'} == 1)
        ? "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/[SourceFile]\">"
        : "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";

    $doc_obj->add_utf8_metadata ($top_section, "srclink", $open_link);
    $doc_obj->add_utf8_metadata ($top_section, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($top_section, "/srclink", "</a>");

    return 1;
}
502
5031;
504
505
506
507
508
509
510
Note: See TracBrowser for help on using the repository browser.