Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 26893

Last change on this file since 26893 was 26893, checked in by kjdon, 11 years ago
ConvertBinaryFile needs to reset the doc OID after all the processing has been done. This will mean it uses the top level plugin OIDtype settings, rather than the secondary plugin ones.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.2 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29	# PostScriptPlugin,
30	# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31	# to either HTML, Text or a series of images. It works by dynamically loading
32	# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33	# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46	use util;
47
48
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
sub BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
52
# Values accepted by the 'convert_to' plugin argument.
my $convert_to_list = [
    { 'name' => 'auto', 'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html', 'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text', 'desc' => '{ConvertBinaryFile.convert_to.text}' },
];

# Plugin arguments contributed by ConvertBinaryFile. The 'desc' values are
# resource keys, not literal descriptions.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',    # was once considered as 'regexp'
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option block describing this (abstract) plugin; pushed onto the shared
# "OptList" by the constructor.
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
92
93
# Load and instantiate the secondary plugin(s) that will process the
# converted file.
#
# $self->{'convert_to_plugin'} holds a comma separated list of plugin class
# names. For each class, <class>.pm is 'require'd -- looked for first in
# $GSDLCOLLECTDIR/perllib/plugins (when GSDLCOLLECTDIR is set) and then in
# $GSDLHOME/perllib/plugins -- and an object is constructed with an empty
# plugin list and $input_args->{<class>} as its argument list.
#
# Arguments:
#   $class           - unused here; accepted for interface compatibility
#   $input_args      - hash ref mapping plugin class name to its arguments
#   $hashArgOptLists - unused here; accepted for interface compatibility
#
# Stores a { class name => plugin object } hash ref in
# $self->{'secondary_plugins'}. Dies if a plugin file cannot be found or if
# a constructor dies.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # Treat a missing setting as "no secondary plugins".
    my $plugin_spec = $self->{'convert_to_plugin'};
    $plugin_spec = "" unless defined $plugin_spec;

    my @convert_to_list = split(",", $plugin_spec);
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the package file for the "convert_to" plugin
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with the extra options that we've worked out.
        # A direct class-method call is used rather than building
        # "new $plugin_class(...)" as a string and eval'ing it: the string
        # form re-parses the plugin name as code and relies on the ambiguous
        # indirect object syntax. An exception thrown by the constructor
        # still propagates to our caller.
        my $arglist = $input_args->{$plugin_class};
        my $secondary_plugin = $plugin_class->new([], $arglist);

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
137
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - plugin arguments, passed through to the parent
#                      constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object built by AutoExtractMetadata's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow syntax rather than "new AutoExtractMetadata(...)": this package
    # defines its own new(), which makes the indirect object form ambiguous.
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
150
# Derive the secondary plugin name and output file extension from
# $self->{'convert_to'}.
#
# Should be called by subclasses after checking and setting
# $self->{'convert_to'}.
#
#   convert_to value     convert_to_plugin       convert_to_ext
#   auto                 HTMLPlugin              html  (convert_to is reset to "html")
#   html...              HTMLPlugin              html
#   text                 TextPlugin              txt
#   structuredhtml       StructuredHTMLPlugin    html
#   pagedimg[_<ext>]     PagedImagePlugin        <ext> if it is jpg, gif or
#                                                png (matched case
#                                                insensitively), else jpg
#
# Any other value leaves 'convert_to_plugin' and 'convert_to_ext' untouched.
sub set_standard_convert_settings {
    my $self =shift (@_);

    my $convert_to = $self->{'convert_to'};
    if ($convert_to eq "auto") {
        $convert_to = "html";
        $self->{'convert_to'} = "html";
    }

    if ($convert_to =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # The alternation must use unescaped '|'; an escaped pipe matches a
        # literal "|" character, so the requested image type was never
        # recognised and the extension always fell back to jpg.
        my ($convert_to_ext) = $convert_to =~ /pagedimg_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }
}
# Initialise this plugin via the parent class, then initialise every loaded
# secondary plugin with the same verbosity and output/fail handles.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity,$outhandle,$failhandle);

    my $loaded = $self->{'secondary_plugins'};
    foreach my $secondary (values %$loaded) {
        $secondary->init($verbosity,$outhandle,$failhandle);
    }
}
191
# Called only once, after all plugin passes have been done: passes the
# deinit() call on to every loaded secondary plugin.
sub deinit {
    my $self = $_[0];

    my $loaded = $self->{'secondary_plugins'};
    foreach my $secondary (values %$loaded) {
        $secondary->deinit();
    }
}
204
# Hook called with the converted filename straight after conversion.
# The default implementation does no post processing; subclasses may
# override it.
sub convert_post_process { return; }
210
211
# Run the conversion utility (gsConvert.pl) on the input file.
#
# A link to the input file is made in a temporary folder (or, when no
# temporary folder can be obtained, in the input file's own directory) and
# the conversion is run on that link, so that we don't accidentally damage
# the input.
#
# Arguments:
#   $output_ext     - requested output extension. Currently unused: the
#                     output type handed to gsConvert.pl is
#                     $self->{'convert_to'}.
#   $input_filename - the file to convert
#   $textref        - currently unused
#
# Returns the name of the converted file, or "" if gsConvert.pl reported
# "fail" (in which case the contents of the error log are copied to the
# plugin's outhandle). Sets $self->{'tmp_dir'} when a temporary folder was
# obtained, and $self->{'converted_to'} to "HTML", "Text" or "PagedImage"
# according to the output type reported by gsConvert.pl.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type=$self->{'convert_to'};

    my $cmd = "\"".&util::get_perl_exec()."\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # gsConvert.pl's standard output is taken as the resulting output type
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # Three-arg open on a lexical handle; if the log cannot be
            # opened we simply have nothing to copy (best effort).
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            print $outhandle "\n";
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # NOTE(review): this path is built from $tailname, whereas the tmp
        # file itself was named from $utf8_tailname -- confirm that this is
        # what gsConvert.pl produces.
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # \Q..\E: the suffix is literal text (e.g. ".doc"), not a pattern;
        # unquoted, its "." would match any character.
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
338
339
# Override of the inherited read_into_doc_obj: convert the file, then hand
# the converted file to the secondary plugin to build the doc object, and
# finally fix up the doc object so that it describes the original file.
#
# Returns:
#   -1              if the conversion failed (or produced no file), or if
#                   this plugin's process() returned undef
#   0               if there is no secondary plugin to use
#   $rv             (undef or a value < 1) as returned by the secondary
#                   plugin when it did not process the converted file
#   (1, $doc_obj)   on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    # (The alternation needs an unescaped '|'; with an escaped pipe the
    # pattern only matched a literal "HTML|Text" and fribidi was never run.)
    my $converted_to = $self->{'converted_to'};
    if ($self->{'apply_fribidi'} && defined $converted_to && $converted_to =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    # Sorted so that, should there be several, the same plugin is chosen on
    # every run (hash key order is not stable).
    my @plugin_names = sort keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    ## **** I just replaced $metadata with {} in following
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # ****
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    # force a new OID - this will use OIDtype option set for this plugin.
    $self->add_OID($doc_obj, 1);

    return (1, $doc_obj);

}
441
# Plugin specific processing of a doc object: delegates to process_type(),
# passing on only the base directory, the file and the doc object.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
448
# Do plugin specific processing of doc_obj for the original file's type:
# associate the original file with the top section of the doc object and
# record the FileFormat, srcicon and source link metadata.
#
# The associated file is named "doc.<ext>" (extension lower-cased), unless
# the keep_original_filename option is on, in which case the names are
# obtained from the doc object. Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # lower-cased extension of the file; empty when there is none
    my $doc_ext = ($file =~ /\.(\w+)$/) ? lc($1) : "";

    my $file_type = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";
    my $keep_original = ($self->{'keep_original_filename'} == 1);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);

    # when keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as SourceFile is used
    # in the srclink (below)
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";

    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original
        ? $doc_obj->get_sourcefile()
        : "doc.$doc_ext";

    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $srclink_filename);
    $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $srclink_filename);
    return 1;
}
484
# Remove the temporary conversion folder recorded in $self->{'tmp_dir'}
# (if it is set and still exists as a directory) and forget about it.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $scratch_dir = $self->{'tmp_dir'};
    if (defined($scratch_dir) && -d $scratch_dir) {
        ##print STDERR "**** Suppressing clean up of tmp dir\n";
        &util::rm_r($scratch_dir);
        $self->{'tmp_dir'} = undef;
    }
}
497	1;
498
499
500
501
502
503
504

Note: See TracBrowser for help on using the repository browser.

Download in other formats: