source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 21760

Last change on this file since 21760 was 21760, checked in by kjdon, 14 years ago

srclink is now generated dynamically at runtime. Instead of storing srclink metadata, we store srclink_file metadata, which can be a value (doc.doc) or a metadata format element (e.g. [SourceFile]).

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
RevLine 
[10450]1###########################################################################
2#
[15906]3# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4# through gsConvert.pl
[10450]5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
# PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of
# these document types to either HTML, Text or a series of images. It works by
# dynamically loading an appropriate secondary plugin (HTMLPlugin,
# StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin
# argument 'convert_to'.
[12741]34
[15871]35package ConvertBinaryFile;
[10450]36
[15906]37use AutoExtractMetadata;
[10450]38use ghtml;
[15871]39use HTMLPlugin;
40use TextPlugin;
41use PagedImagePlugin;
[10450]42
[10453]43use strict;
44no strict 'refs'; # allow filehandles to be variables and viceversa
[11680]45no strict 'subs';
[15871]46
# Establish the inheritance chain at compile time: ConvertBinaryFile derives
# from AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
50
# Values accepted by the 'convert_to' option (used as the enum list below).
my $convert_to_list = [
    { 'name' => "auto", 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html", 'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "text", 'desc' => "{ConvertBinaryFile.convert_to.text}" },
];

# Option descriptions for this plugin; new() appends these to the shared
# "ArgList".
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto",
    },
    {
        'name' => "keep_original_filename",
        'desc' => "{ConvertBinaryFile.keep_original_filename}",
        'type' => "flag",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        #'type' => "regexp",
        'deft' => "",
    },
    {
        'name' => "apply_fribidi",
        'desc' => "{ConvertBinaryFile.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "use_strings",
        'desc' => "{ConvertBinaryFile.use_strings}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Plugin description record; new() appends this to the shared "OptList".
my $options = {
    'name'     => "ConvertBinaryFile",
    'desc'     => "{ConvertBinaryFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
90
91
# Load and construct the secondary plugin(s) that will process the converted
# output.
#
# $self->{'convert_to'} is split on commas; each entry NAME is mapped to the
# class "NAMEPlugin", expected in the file "NAMEPlugin.pm".  The file is looked
# for first in perllib/plugins under $ENV{'GSDLCOLLECTDIR'} (when that is set)
# and then in perllib/plugins under $ENV{'GSDLHOME'}.
#
# Arguments (after $self):
#   $class           - not used in this method
#   $input_args      - hash ref; $input_args->{<plugin class>} is handed to
#                      that plugin's constructor as its argument list
#   $hashArgOptLists - not used in this method
#
# Dies if a plugin file cannot be found (after reporting it via gsprintf) or
# if a plugin constructor throws.
#
# Side effect: stores { plugin class name => plugin object } in
# $self->{'secondary_plugins'}.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    my $secondary_plugins = {};

    foreach my $convert_to (split(",", $self->{'convert_to'})) {
        # work out which "convert_to" plugin package to load
        my $plugin_class   = $convert_to . "Plugin";
        my $plugin_package = $plugin_class . ".pm";

        # a collection-specific copy of the plugin takes precedence
        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib", "plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib", "plugins",
                                               $plugin_package);

        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with the options worked out for it
        my $arglist = $input_args->{$plugin_class};

        # A direct method call on the class name replaces the former string
        # eval with indirect-object syntax; any constructor failure is still
        # caught and re-thrown unchanged.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Registers this class's arguments and options, builds the object through
# AutoExtractMetadata, and then resolves the requested 'convert_to' value into
#   $self->{'convert_to'}     - "HTML", "Text", "StructuredHTML" or "PagedImage"
#   $self->{'convert_to_ext'} - "html", "txt", or the image extension
# An empty/undefined or "auto" request is resolved per plugin (see below) and
# otherwise falls back to "html".  A request that matches none of the known
# types leaves 'convert_to' as given and does not set 'convert_to_ext'.
#
# When $self->{'info_only'} is set the object is returned immediately,
# without any of this option processing.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    # don't worry about any options etc. when only info is wanted
    return bless $self, $class if ($self->{'info_only'});

    my $requested = $self->{'convert_to'};
    $requested = "auto" if (!defined $requested || $requested eq "");

    my $windows_scripting = $self->{'windows_scripting'};
    $windows_scripting = 0 unless defined $windows_scripting;

    # evaluated lazily, only in the branches that need to know
    my $on_windows = sub { return $ENV{'GSDLOS'} =~ /^windows$/i; };

    # plugin-specific adjustments, keyed on the first entry of the plugin list
    if ($classPluginName eq "PDFPlugin") {
        if ($requested eq "text" && $on_windows->()) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $requested = "html";
        }
    }
    elsif ($classPluginName eq "WordPlugin") {
        # with windows scripting we use structured HTML, not normal html
        if ($windows_scripting && $on_windows->() && $requested =~ /^(html|auto)$/) {
            $requested = "structuredhtml";
        }
    }
    elsif ($classPluginName eq "PowerPointPlugin") {
        # with windows scripting we use paged images
        if ($windows_scripting && $on_windows->() && $requested eq "auto") {
            $requested = "pagedimg_jpg";
        }
    }
    elsif ($classPluginName eq "PostScriptPlugin") {
        # PostScript defaults to text
        $requested = "text" if ($requested eq "auto");
    }

    # choose html for now - should choose a format based on doc type
    $requested = "html" if ($requested eq "auto");

    # request => [ convert_to, convert_to_ext ]
    my %fixed_target = (
        "html"           => [ "HTML",           "html" ],
        "text"           => [ "Text",           "txt"  ],
        "structuredhtml" => [ "StructuredHTML", "html" ],
    );

    if (exists $fixed_target{$requested}) {
        my ($target_name, $target_ext) = @{ $fixed_target{$requested} };
        $self->{'convert_to'}     = $target_name;
        $self->{'convert_to_ext'} = $target_ext;
    }
    elsif ($requested =~ /^pagedimg/) {
        $self->{'convert_to'} = "PagedImage";
        # image type comes from the request suffix, defaulting to jpg
        my ($image_ext) = $requested =~ /pagedimg\_(jpg|gif|png)/i;
        $image_ext = 'jpg' unless defined $image_ext;
        $self->{'convert_to_ext'} = $image_ext;
    }

    return bless $self, $class;
}
203
204
# Initialise this plugin through the parent class, then pass the same
# verbosity and handles on to every secondary plugin stored in
# $self->{'secondary_plugins'}.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $secondary_plugins = $self->{'secondary_plugins'};
    my @handles = ($verbosity, $outhandle, $failhandle);

    foreach my $name (keys %$secondary_plugins) {
        $secondary_plugins->{$name}->init(@handles);
    }
}
218
# Called only once, after all plugin passes have been done: forwards the
# deinit() call to every secondary plugin in $self->{'secondary_plugins'}.
sub deinit {
    my ($self) = @_;

    my $secondary_plugins = $self->{'secondary_plugins'};

    foreach my $name (keys %$secondary_plugins) {
        $secondary_plugins->{$name}->deinit();
    }
}
231
# Hook invoked by read_into_doc_obj with the converted file's name.
# By default no post processing is done; nothing is returned.
sub convert_post_process {
    return;
}
237
238
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a 'tmp' directory -- under
# $ENV{'GSDLCOLLECTDIR'} if that is set, else under $ENV{'GSDLHOME'} if that is
# set, else under the input file's own directory -- so that we don't
# accidentally damage the input.  The input is soft-linked into that directory
# under a (possibly renamed) filename and the link is removed afterwards.
#
# Arguments (after $self):
#   $output_ext     - accepted for interface compatibility but not read here;
#                     the output type requested from gsConvert.pl is derived
#                     from $self->{'convert_to'} and $self->{'convert_to_ext'}
#   $input_filename - path of the file to convert
#   $textref        - not used in this method
#
# Returns the name of the converted file, or "" if gsConvert.pl reported
# "fail" (in which case the failure and any error log content are written to
# the out/fail handles).
#
# Side effect: sets $self->{'converted_to'} to "HTML", "Text" or "PagedImage"
# according to the output type reported by gsConvert.pl.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = $dirname;
    if (defined $ENV{'GSDLCOLLECTDIR'}) {
        $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
    } elsif (defined $ENV{'GSDLHOME'}) {
        $tmp_dirname = $ENV{'GSDLHOME'};
    }
    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # Stripping white space, dots and hyphens from $tailname used to happen
    # here.  It is deliberately disabled: it is not necessary and will cause
    # problems with replacing_srcdoc_with_html in the GSDLremote case.
    #$tailname =~ s/\s+//g;
    #$tailname =~ s/\.+//g;
    #$tailname =~ s/\-+//g;

    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = "";
    if ($convert_to =~ m/PagedImage/i) {
        $output_type = lc($convert_to)."_".lc($convert_to_ext);
    } else {
        $output_type = lc($convert_to);
    }

    # NOTE(review): the command is assembled as a shell string, so a filename
    # containing double quotes, backticks or '$' would be interpreted by the
    # shell.  Left as is because 'convert_options' is a free-form option
    # string that relies on shell word splitting.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    # gsConvert.pl reports the type it actually produced on its stdout
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # Echo the converter's error log.  Lexical handle and 3-arg open;
            # if the log cannot be opened we simply have nothing to echo.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # item files live in a sub-directory named after the file
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # Swap the original suffix for the output type.  The suffix is quoted
        # so that regex metacharacters in an extension are matched literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
369
370
# Override of read_into_doc_obj: convert the file, then hand the converted
# file to the secondary plugin and finish off the resulting doc object.
#
# Steps: convert via tmp_area_convert_file, run convert_post_process,
# optionally run "fribidi" over HTML/Text output, let the (single) secondary
# plugin build the doc object from the converted file, then re-point the doc
# object's source information at the original file and add this plugin's
# metadata, associated files, extracted metadata, title fallback and OID.
#
# Returns:
#   -1             if conversion failed, the converted file does not exist,
#                  or $self->process() returned undef
#   0              if no secondary plugin is loaded
#   $rv            as returned by the secondary plugin when it is undefined
#                  or less than 1 (file was not processed)
#   (1, $doc_obj)  on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_output = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirect may already have created the output file;
            # don't leave it behind in the tmp directory
            &util::rm($fribidi_output) if (-e $fribidi_output);
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet!  Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level -- an empty hash is
    # passed in place of $metadata, and "" in place of $base_dir
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv < 1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used in
    # archives dbs for incremental build. so set it manually.
    $doc_obj->{'source_path'} = $filename_full_path;
    $doc_obj->set_converted_filename($collect_conv_file);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # record the original filename without its directory or extension
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $topsection, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
[14928]469
# Plugin-specific processing of a doc object: delegates to process_type,
# passing on only the base directory, the file and the doc object.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
476
# Do plugin specific processing of doc_obj for the doc_ext type.
#
# Associates the original file with the top section of the doc object and
# records FileFormat, srcicon and srclink_file metadata there.  With
# 'keep_original_filename' set, the associated file keeps the name given by
# the doc object's get_assocfile_from_sourcefile() and srclink_file is the
# format element "[SourceFile]"; otherwise both are "doc.<extension>".
# The srclink itself is not stored here, only srclink_file.
#
# Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my $doc_ext   = $self->{'filename_extension'};
    my $file_type = (defined $self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    my $keep_original = ($self->{'keep_original_filename'} == 1);
    my $default_name  = "doc.$doc_ext";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename   = &util::filename_cat($base_dir, $file);

    # when keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as [SourceFile] is
    # used for the srclink
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : $default_name;
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original ? "[SourceFile]" : $default_name;

    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata($cursection, "srclink_file", $srclink_filename);

    return 1;
}
512
5131;
514
515
516
517
518
519
520
Note: See TracBrowser for help on using the repository browser.