Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 24290

Last change on this file since 24290 was 24290, checked in by sjm84, 13 years ago
Several changes to how Greenstone hashes PDF files and also added several more options to the EmbeddedMetadataPlugin
Property svn:keywords set to `Author Date Id Revision`
File size: 17.2 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29	# PostScriptPlugin,
30	# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31	# to either HTML, Text or a series of images. It works by dynamically loading
32	# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33	# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46	use Config; # for getting the perlpath in the recommended way
47
48
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
52
# Values accepted by the 'convert_to' argument.  Each 'desc' is a braced
# lookup key of the form {ConvertBinaryFile.convert_to.<name>}
# (presumably resolved to display text elsewhere -- not visible here).
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto html text)
];

# Argument definitions contributed by this plugin; new() appends them to
# the caller's "ArgList".
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description block; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
92
93
# Load and instantiate the secondary plugin(s) named in
# $self->{'convert_to_plugin'} (a comma-separated list of plugin class names).
#
# For each name, "<name>.pm" is looked up first under
# $GSDLCOLLECTDIR/perllib/plugins (only when GSDLCOLLECTDIR is set) and then
# under $GSDLHOME/perllib/plugins; the first file that exists is require'd.
# If neither exists, an error is reported via gsprintf and the sub dies.
#
# Arguments (after $self):
#   $class           - not used by this sub
#   $input_args      - hash ref; $input_args->{<plugin class>} is handed to
#                      that plugin's constructor as its argument list
#   $hashArgOptLists - not used by this sub
#
# On return, $self->{'secondary_plugins'} is a hash ref mapping each plugin
# class name to the instance that was created.  Constructor failures are
# re-thrown (stringified).
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    my @convert_to_list = split(",",$self->{'convert_to_plugin'});
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # locate the "convert_to" plugin package on disk
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Plain class-method call inside a block eval: no source code is
        # built from the class name, and errors are still caught and
        # re-thrown exactly as before.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
137
# Constructor.
#
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - passed through unchanged to AutoExtractMetadata->new
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" and its option block to
#                      its "OptList"
#
# Returns the object built by AutoExtractMetadata->new, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # explicit class-method call rather than indirect-object "new Class(...)"
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
150
# Derive the secondary plugin and output extension from the requested
# conversion target.  Should be called by subclasses after checking and
# setting $self->{'convert_to'}.
#
# Reads:  $self->{'convert_to'}  ("auto" is rewritten to "html")
# Writes: $self->{'convert_to_plugin'} and $self->{'convert_to_ext'}:
#   html*            -> HTMLPlugin,           "html"
#   text             -> TextPlugin,           "txt"
#   structuredhtml   -> StructuredHTMLPlugin, "html"
#   pagedimg[_<ext>] -> PagedImagePlugin,     <ext> (jpg, gif or png;
#                       defaults to "jpg" when none is given)
# Any other value leaves both settings untouched.
sub set_standard_convert_settings {
    my $self = shift (@_);

    my $convert_to = $self->{'convert_to'};
    if ($convert_to eq "auto") {
        $convert_to = "html";
        $self->{'convert_to'} = "html";
    }

    if ($convert_to =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # the alternation must use bare '|'; an escaped pipe would only
        # match a literal "|" and the extension would never be captured
        my ($convert_to_ext) = $convert_to =~ /pagedimg_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }

}
# Initialise this plugin via the parent class, then forward the same
# verbosity and handles to every secondary plugin held in
# $self->{'secondary_plugins'}.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $secondary_plugins = $self->{'secondary_plugins'};
    foreach my $secondary (values %$secondary_plugins) {
        $secondary->init($verbosity, $outhandle, $failhandle);
    }
}
192
# Called only once, after all plugin passes have been done.
# Forwards the call to every secondary plugin in
# $self->{'secondary_plugins'}.
sub deinit {
    my $self = shift (@_);

    my $secondary_plugins = $self->{'secondary_plugins'};
    foreach my $secondary (values %$secondary_plugins) {
        $secondary->deinit();
    }
}
205
# Hook run on the converted file (read_into_doc_obj calls it with the
# converted filename).  The default implementation does no post
# processing; subclasses may override it.
sub convert_post_process {
    return;
}
211
212
# Run conversion utility on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments (after $self):
#   $output_ext     - desired output type; accepted for interface
#                     compatibility but not consulted here (the target is
#                     taken from $self->{'convert_to'})
#   $input_filename - path of the file to convert
#   $textref        - not used by this sub
#
# Side effects: records the tmp directory in $self->{'tmp_dir'} when one
# could be created, and sets $self->{'converted_to'} to "HTML", "Text" or
# "PagedImage" according to what gsConvert.pl reports.
#
# Returns the converted file's name, or "" when gsConvert.pl reports "fail"
# (in which case the error log, if any, is copied to the output handle and
# removed).
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        # NOTE(review): falling back to the input's own directory means the
        # link created (and later removed) below may coincide with the
        # original file when the renamed tail equals the original -- confirm
        # this cannot happen in practice.
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    my $cmd = "\"$Config{perlpath}\" -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    # gsConvert.pl prints the type it actually produced (or "fail")
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # lexical handle, explicit read mode, and a checked open
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle "$line";
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # \Q..\E: the suffix is literal text, not a pattern (its leading
        # '.' and any other metacharacters must match themselves)
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
339
340
# Override the inherited read_into_doc_obj - we need to call secondary
# plugin stuff.
#
# Converts the input file via tmp_area_convert_file, optionally runs
# "fribidi" over the result, hands the converted file to the (single)
# secondary plugin to build the doc object, then overrides the source
# filename/path and attaches this plugin's own metadata to it.
#
# Returns:
#   -1               conversion failed, converted file missing, or
#                    process() returned undef
#    0               no secondary plugin is loaded (file is blocked)
#   $rv              whatever the secondary plugin returned, when that is
#                    undefined or < 1
#   (1, $doc_obj)    on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    # (the alternation needs a bare '|': an escaped pipe only matches the
    # literal text "HTML|Text", so fribidi would never be applied)
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    ## **** I just replaced $metadata with {} in following
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # ****
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);

}
441
# Plugin-specific processing of the doc object: delegates to process_type,
# passing on only the base directory, file and doc object.  Returns
# whatever process_type returns.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
448
# do plugin specific processing of doc_obj for doc_ext type
#
# Associates the original file with the doc object's top section and
# records FileFormat, srcicon, srclink_file and srclinkFile metadata.  The
# associated/linked name is "doc.<lower-cased extension>" unless
# $self->{'keep_original_filename'} is 1, in which case the names are taken
# from the doc object.  Always returns 1.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    # need to check that not empty
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = lc($doc_ext);

    my $file_type = (defined $self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);

    # when keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as SourceFile is
    # used in the srclink (below)
    my $assocfilename = ($self->{'keep_original_filename'} == 1)
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";

    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = ($self->{'keep_original_filename'} == 1)
        ? $doc_obj->get_sourcefile()
        : "doc.$doc_ext";

    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon".$doc_ext."_");
    foreach my $meta_name ("srclink_file", "srclinkFile") {
        $doc_obj->add_utf8_metadata($cursection, $meta_name, $srclink_filename);
    }

    return 1;
}
484
# Post-processing hook for the tmp directory recorded in
# $self->{'tmp_dir'}.  Removal of the directory is currently suppressed:
# when the directory exists, a notice is written to STDERR and the
# recorded path is merely forgotten (set to undef).
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $tmp_dir = $self->{'tmp_dir'};
    if ((defined $tmp_dir) && (-d $tmp_dir)) {
        print STDERR "**** Supressing clean up of tmp dir\n";
        ##&util::rm_r($tmp_dir);
        $self->{'tmp_dir'} = undef;
    }
}
497	1;
498
499
500
501
502
503
504

Note: See TracBrowser for help on using the repository browser.

Download in other formats: