Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago
Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)
Property svn:keywords set to `Author Date Id Revision`
File size: 16.7 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29	# PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of
30	# these document types to either HTML, Text or a series of images. It works by
31	# dynamically loading an appropriate secondary plugin (HTMLPlugin,
32	# StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin
33	# argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = ('AutoExtractMetadata');
}
50
# Values offered for the 'convert_to' enum argument.
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
];

# Arguments this plugin contributes to the argument list.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option record for this plugin; pushed onto the "OptList" by new().
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) named in
# $self->{'convert_to_plugin'} (a comma-separated list of plugin class names).
#
# For each class, "<Class>.pm" is looked for first under
# $GSDLCOLLECTDIR/perllib/plugins (only when GSDLCOLLECTDIR is set) and then
# under $GSDLHOME/perllib/plugins; the first one that exists is require'd.
#
# Parameters:
#   $class           - accepted for interface compatibility; not used here
#   $input_args      - hash ref keyed by plugin class name; the entry for a
#                      class is handed to that class's constructor
#   $hashArgOptLists - accepted for interface compatibility; not used here
#
# On return $self->{'secondary_plugins'} is a hash ref mapping each plugin
# class name to its newly constructed object.
#
# Dies if a plugin file cannot be found or if a constructor dies.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    my @convert_to_list = split(",", $self->{'convert_to_plugin'});
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # work out where the "convert_to" plugin package lives
        my $plugin_package = $plugin_class . ".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib", "plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib", "plugins",
                                               $plugin_package);

        # a collection-specific plugin takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Call the constructor as an ordinary class method. A block eval keeps
        # the "catch and re-throw" behaviour without compiling the class name
        # as Perl source, which a string eval would do.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through unchanged to AutoExtractMetadata->new
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument definitions and option record
#
# Returns the object built by AutoExtractMetadata->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form
    # "new AutoExtractMetadata(...)" is ambiguous inside a package that
    # defines its own new().
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
148
# Should be called by subclasses after checking and setting
# $self->{'convert_to'}.
#
# Derives the secondary plugin class and the converted file extension from
# $self->{'convert_to'} and stores them in $self->{'convert_to_plugin'} and
# $self->{'convert_to_ext'}:
#
#   auto            -> treated as html ($self->{'convert_to'} is rewritten)
#   html*           -> HTMLPlugin / html   (html or html_multi)
#   text            -> TextPlugin / txt
#   structuredhtml  -> StructuredHTMLPlugin / html
#   pagedimg*       -> PagedImagePlugin / the jpg, gif or png taken from
#                      "pagedimg_<ext>", defaulting to jpg
#
# Any other value leaves both settings untouched.
sub set_standard_convert_settings {
    my $self = shift (@_);

    my $convert_to = $self->{'convert_to'};
    if ($convert_to eq "auto") {
        $convert_to = "html";
        $self->{'convert_to'} = "html";
    }

    if ($convert_to =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # The alternation must use unescaped '|'; an escaped pipe matches a
        # literal '|' character and the extension would never be captured.
        my ($convert_to_ext) = $convert_to =~ /pagedimg_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }

}
# Initialise this plugin through the parent class, then pass the same
# verbosity and handles to every secondary plugin.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugins = $self->{'secondary_plugins'};
    $plugins->{$_}->init($verbosity, $outhandle, $failhandle)
        for keys %$plugins;
}
190
# Called only once, after all plugin passes have been done.
# Calls deinit() on every secondary plugin.
sub deinit {
    my $self = shift (@_);

    my $plugins = $self->{'secondary_plugins'};
    $plugins->{$_}->deinit() for keys %$plugins;
}
203
# Hook called with the converted filename once conversion has succeeded.
# The default implementation does no post processing and returns nothing.
sub convert_post_process {
    return;
}
209
210
# Run conversion utility on the input file.
#
# The conversion takes place in a temporary folder obtained from
# util::get_timestamped_tmp_folder (falling back to the input file's own
# directory when that returns undef) so that we don't accidentally damage
# the input.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; the requested
#                     output type is actually taken from $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# Returns the filename of the converted output, or "" if the converter
# reported "fail". Also sets $self->{'tmp_dir'} (when a temporary folder was
# created) and $self->{'converted_to'} ("HTML", "Text" or "PagedImage",
# according to the output type the converter reported).
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    # print STDERR "**** filename $tailname$suffix is already UTF8\n" if &unicode::check_is_utf8($tailname);
    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};
    # if ($convert_to =~ m/PagedImage/i) {
    #     $output_type = lc($convert_to)."_".lc($convert_to_ext);
    # } else {
    #     $output_type = lc($convert_to);
    # }

    # NOTE(review): the command line is assembled by string interpolation and
    # run through the shell; filenames containing double quotes, backticks or
    # '$' are not escaped here.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # copy the converter's error log to the output handle
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            } else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # the item file lives in a sub-folder named after the document
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # Swap the original suffix for the output type. The suffix is quoted
        # so that '.' and any other regex metacharacters in it match literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
331
332
# Overrides the inherited read_into_doc_obj - we need to call secondary
# plugin stuff.
#
# Converts the file with tmp_area_convert_file, optionally runs fribidi over
# the result, has the (single) secondary plugin build a doc object from the
# converted file, and then fixes up that doc object so that it refers to the
# original source file.
#
# Returns:
#   -1              if conversion failed, the converted file does not exist,
#                   or process() returned undef
#   0               if no secondary plugin is loaded
#   $rv             the secondary plugin's own result when it is undefined
#                   or less than 1
#   (1, $doc_obj)   on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    # The alternation must use an unescaped '|'; an escaped pipe only matches
    # the literal text "HTML|Text", which 'converted_to' never holds.
    if ($self->{'apply_fribidi'}
        && defined $self->{'converted_to'}
        && $self->{'converted_to'} =~ /(HTML|Text)/) {
        # NOTE(review): the filename is interpolated into a shell command
        # without escaping.
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    ## **** I just replaced $metadata with {} in following
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv < 1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # ****
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $topsection, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);

}
433
# Plugin-specific processing of a doc object: forwards the base directory,
# file name and doc object to process_type and returns its result. The
# remaining arguments are accepted but not used here.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
440
# Do plugin specific processing of doc_obj for doc_ext type.
#
# Associates the original file with the doc object's top section and records
# the "FileFormat", "srcicon" and "srclink_file" metadata on that section.
#
# Parameters:
#   $base_dir - directory that $file is relative to
#   $file     - name of the original (unconverted) file
#   $doc_obj  - doc object to update
#
# The associated file is named "doc.<ext>", where <ext> is the extension of
# $file (empty when $file has none), unless the 'keep_original_filename'
# option is set, in which case the names come from the doc object itself.
#
# Always returns 1.
sub process_type {
    my $self = shift (@_);
    my ($base_dir, $file, $doc_obj) = @_;

    # A filename without an extension leaves the capture undefined; fall back
    # to the empty string so nothing below interpolates undef.
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = "" unless defined $doc_ext;

    my $file_type = "unknown";
    $file_type = $self->{'file_type'} if defined $self->{'file_type'};

    # the flag may be unset, so test definedness before comparing
    my $keep_original = (defined $self->{'keep_original_filename'}
                         && $self->{'keep_original_filename'} == 1);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    my $assocfilename = "doc.$doc_ext";
    if ($keep_original) {
        # this should be the same filename that was used for the Source and SourceFile metadata,
        # as we will use SourceFile in the srclink (below)
        $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
    }
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);
    my $srclink_filename = "doc.$doc_ext";
    if ($keep_original) {
        $srclink_filename = $doc_obj->get_sourcefile();
    }
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $srclink_filename);
    return 1;
}
472
# Remove the temporary conversion folder recorded in $self->{'tmp_dir'},
# if there is one and it still exists as a directory, then forget it.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $tmp_dir = $self->{'tmp_dir'};
    my $have_tmp_dir = (defined $tmp_dir && -d $tmp_dir);
    if ($have_tmp_dir) {
        &util::rm_r($tmp_dir);
        $self->{'tmp_dir'} = undef;
    }
}

1;
485
486
487
488
489
490
491

Note: See TracBrowser for help on using the repository browser.

Download in other formats: