Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 28489

Last change on this file since 28489 was 28381, checked in by ak19, 11 years ago
Bugfix. When dealing with filenames with special characters that are converted to URL encoding, on Windows ConvertBinaryFile wasn't looking for the new filename but the original one. It needs to be looking for the new filename. Problem noticed with a ppt file sent in to the mailing list.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.4 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
# PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of
# these document types to either HTML, text or a series of images. It works by
# dynamically loading an appropriate secondary plugin (HTMLPlugin,
# StructuredHTMLPlugin, PagedImagePlugin or TextPlugin), chosen according to
# the plugin argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46	use util;
47	use FileUtils;
48
49
# Set up inheritance at compile time: ConvertBinaryFile is a subclass of
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
53
# Values accepted by the 'convert_to' argument. Each description is the
# placeholder string "{ConvertBinaryFile.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto html text)
];

# Arguments contributed by this plugin; new() appends them to the caller's
# "ArgList".
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto",
    },
    {
        'name' => "keep_original_filename",
        'desc' => "{ConvertBinaryFile.keep_original_filename}",
        'type' => "flag",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",    # was once considered as "regexp"
        'deft' => "",
    },
    {
        'name' => "apply_fribidi",
        'desc' => "{ConvertBinaryFile.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "use_strings",
        'desc' => "{ConvertBinaryFile.use_strings}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "ConvertBinaryFile",
    'desc'     => "{ConvertBinaryFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
93
94
# Load and instantiate every secondary plugin named in
# $self->{'convert_to_plugin'} (a comma-separated list of plugin class names).
#
# For each name, the file <name>.pm is looked for first under
# $ENV{'GSDLCOLLECTDIR'}/perllib/plugins (when that variable is defined) and
# then under $ENV{'GSDLHOME'}/perllib/plugins; the first one that exists is
# loaded with require.
#
# Parameters:
#   $class           - not used by this method
#   $input_args      - hash ref; $input_args->{<plugin class>} is handed to
#                      that plugin's constructor as its second argument
#   $hashArgOptLists - not used by this method
#
# Side effects: stores a hash ref (plugin class name => plugin object) in
# $self->{'secondary_plugins'}.
#
# Dies if a plugin file cannot be found or if a plugin constructor dies.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # an unset 'convert_to_plugin' simply means "no secondary plugins"
    my $plugin_names = (defined $self->{'convert_to_plugin'})
        ? $self->{'convert_to_plugin'} : "";
    my @convert_to_list = split(",", $plugin_names);
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the "convert_to" plugin package
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},
                                                           "perllib","plugins",
                                                           $plugin_package);
        }

        my $mainplugname = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},
                                                           "perllib","plugins",
                                                           $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct method call instead of a string eval with indirect-object
        # syntax: the class name is never compiled as Perl source, and the
        # call cannot be mis-parsed as a call to this package's own new().
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
138
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by AutoExtractMetadata's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect-object form
    # "new AutoExtractMetadata(...)" is ambiguous inside a package that
    # defines its own new().
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
151
# Derive the secondary plugin name and output file extension from
# $self->{'convert_to'}. Should be called by subclasses after checking and
# setting $self->{'convert_to'}.
#
# Sets:
#   $self->{'convert_to'}        - "auto" is replaced by "html"
#   $self->{'convert_to_plugin'} - HTMLPlugin, TextPlugin,
#                                  StructuredHTMLPlugin or PagedImagePlugin
#   $self->{'convert_to_ext'}    - html, txt, or for pagedimg_<type> the image
#                                  type (jpg, gif or png; jpg when none given)
#
# Any other 'convert_to' value leaves the plugin and extension untouched.
sub set_standard_convert_settings {
    my $self =shift (@_);

    my $convert_to = $self->{'convert_to'};
    if ($convert_to eq "auto") {
        $convert_to = "html";
        $self->{'convert_to'} = "html";
    }

    if ($convert_to =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # The alternation must use unescaped "|"; with "\|" the pattern only
        # matched literal pipe characters, so gif/png were never recognised.
        my ($convert_to_ext) = $convert_to =~ /pagedimg_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }
}
# Initialise this plugin through the parent class, then initialise each
# secondary plugin in $self->{'secondary_plugins'} with the same verbosity,
# output handle and failure handle.
sub init {
    my ($self, @settings) = @_;
    my ($verbosity, $outhandle, $failhandle) = @settings;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugin_table = $self->{'secondary_plugins'};
    foreach my $secondary (values %$plugin_table) {
        $secondary->init($verbosity, $outhandle, $failhandle);
    }
}
192
# Called only once, after all plugin passes have been done: calls deinit()
# on every secondary plugin in $self->{'secondary_plugins'}.
sub deinit {
    my $self = $_[0];

    my $plugin_table = $self->{'secondary_plugins'};
    foreach my $secondary (values %$plugin_table) {
        $secondary->deinit();
    }
}
205
# Hook run on the converted file by read_into_doc_obj. The default
# implementation does no post processing and returns nothing.
sub convert_post_process {
    return;
}
211
212
# Run conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Parameters:
#   $output_ext     - desired output type; not used by this implementation,
#                     the type actually requested is $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - not used by this implementation
#
# Returns the path of the converted file, or "" when the converter reports
# "fail". Also records the tmp folder in $self->{'tmp_dir'} (when one was
# created) and the resulting document kind in $self->{'converted_to'}
# ("HTML", "Text" or "PagedImage").
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir; fall back to the input file's own
    # directory when no tmp folder could be obtained
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type=$self->{'convert_to'};

    # NOTE(review): the command is assembled as a single shell string; paths
    # are only wrapped in double quotes, so a tmp path containing '"', '$' or
    # a backtick would be mis-parsed by the shell. Confirm such paths cannot
    # reach this point.
    my $cmd = "\"".&util::get_perl_exec()."\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # the converter prints the type it actually produced (or "fail") on STDOUT
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &FileUtils::removeFiles($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # copy the converter's error log to the output handle; lexical
            # handle and named line variable so neither a global filehandle
            # nor the caller's $_ is clobbered
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $errlog_line = <$errlog_fh>) {
                    print $outhandle $errlog_line;
                }
                close($errlog_fh);
            }
            print $outhandle "\n";
        }
        &FileUtils::removeFiles("$errlog") if (-e "$errlog");
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # paged-image output lives in a sub-folder named after the document
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type";
        }
    } else {
        # Swap the original suffix for the produced type. The suffix is
        # quoted so that its leading "." (and any other metacharacter) is
        # matched literally rather than as a regex wildcard.
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
339
340
# Override of the inherited read_into_doc_obj: converts the file first, then
# hands the converted file to the secondary plugin and finally fixes up the
# resulting document object so that it refers to the original source file.
#
# Returns:
#   -1              when conversion fails, the converted file does not exist,
#                   or plugin specific processing (process()) fails
#   0               when no secondary plugin is loaded (file is blocked)
#   $rv             whatever the secondary plugin returned, when that is
#                   undefined or less than 1
#   (1, $doc_obj)   on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    # (the alternation needs an unescaped "|": with "\|" it only matched a
    # literal pipe, so fribidi was never run)
    if ($self->{'apply_fribidi'}
        && defined $self->{'converted_to'}
        && $self->{'converted_to'} =~ /(HTML|Text)/) {
        # NOTE(review): shell string built from a file path; see quoting
        # caveat for paths containing '"', '$' or backticks.
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            &FileUtils::moveFiles("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    # (an empty hash is passed instead of $metadata)
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # FilenameRoot is the source file name without directory and suffix
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    # force a new OID - this will use OIDtype option set for this plugin.
    $self->add_OID($doc_obj, 1);

    return (1, $doc_obj);

}
442
# Plugin specific processing of a document object: delegates to
# process_type() with the base directory, file name and document object
# ($pluginfo, $metadata and $gli are accepted but not passed on) and returns
# its result.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);
    return $result;
}
449
# Do plugin specific processing of doc_obj for the document's file type:
# associates the original file with the top section of the document object
# and records FileFormat, srcicon, srclink_file and srclinkFile metadata.
#
# The associated file is named "doc.<lower-cased extension>", unless
# $self->{'keep_original_filename'} is 1, in which case the names come from
# the document object itself. Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);

    my $file_type = (defined $self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    my $top_section = $doc_obj->get_top_section();
    my $source_path = &FileUtils::filenameConcatenate($base_dir, $file);
    my $default_name = "doc.$doc_ext";

    # associate original file with doc object; when keeping the original
    # name this should be the same filename that was used for the Source and
    # SourceFile metadata, as SourceFile is used in the srclink (below)
    my $assoc_name = ($self->{'keep_original_filename'} == 1)
        ? $doc_obj->get_assocfile_from_sourcefile()
        : $default_name;
    $doc_obj->associate_file($source_path, $assoc_name, undef, $top_section);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($top_section, "FileFormat", $file_type);

    my $link_name = ($self->{'keep_original_filename'} == 1)
        ? $doc_obj->get_sourcefile()
        : $default_name;

    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_utf8_metadata($top_section, "srcicon", "_icon".$doc_ext."_");
    foreach my $meta_name ("srclink_file", "srclinkFile") {
        $doc_obj->add_utf8_metadata($top_section, $meta_name, $link_name);
    }

    return 1;
}
485
# Remove the temporary conversion directory recorded in $self->{'tmp_dir'}
# (if it is set and exists as a directory) and clear that entry.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $conversion_tmp = $self->{'tmp_dir'};
    if (defined($conversion_tmp) && -d $conversion_tmp) {
        ##print STDERR "**** Suppressing clean up of tmp dir\n";
        &FileUtils::removeFilesRecursive($conversion_tmp);
        $self->{'tmp_dir'} = undef;
    }
}
498	1;
499
500
501
502
503
504
505

Note: See TracBrowser for help on using the repository browser.

Download in other formats: