Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 23484

Last change on this file since 23484 was 23387, checked in by davidb, 13 years ago
Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unaware strings.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.0 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29	# PostScriptPlugin,
30	# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31	# to either HTML, Text or a series of images. It works by dynamically loading
32	# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33	# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = ('AutoExtractMetadata');
}
50
# Values accepted by the 'convert_to' option. The 'desc' strings are
# placeholder keys of the form {Plugin.key}, not literal descriptions.
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
];

# Arguments this plugin contributes on top of those it inherits.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        # Description is shared with HTMLPlugin's title_sub option.
        # (A 'regexp' type was tried here and left disabled.)
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) that will process the
# converted output.
#
# The plugin class names come from $self->{'convert_to_plugin'}, a
# comma-separated list. For each name the file "<Name>.pm" is looked up
# first under $GSDLCOLLECTDIR/perllib/plugins (when GSDLCOLLECTDIR is set)
# and then under $GSDLHOME/perllib/plugins. The first one found is loaded
# with require.
#
# Parameters:
#   $class           - not used here
#   $input_args      - hash ref keyed by plugin class name; the matching
#                      entry is passed to that plugin's constructor
#   $hashArgOptLists - not used here
#
# The resulting objects are stored, keyed by class name, in
# $self->{'secondary_plugins'}.
#
# Dies if a plugin file cannot be found or its constructor throws.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    # An unset 'convert_to_plugin' simply means there is nothing to load.
    my $plugin_spec = $self->{'convert_to_plugin'};
    $plugin_spec = "" unless defined $plugin_spec;

    my @convert_to_list = split(",", $plugin_spec);
    my $secondary_plugins = {};

    foreach my $plugin_class (@convert_to_list) {
        # work out where the "convert_to" plugin package might live
        my $plugin_package = $plugin_class.".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        # a collection-specific copy takes precedence over the main one
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct class-method call instead of a string eval: same
        # constructor invocation, but the class name is never compiled as
        # Perl source.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;

        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by AutoExtractMetadata's constructor,
# re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
148
# Derive the secondary plugin name and the output file extension from
# $self->{'convert_to'}.
#
# Should be called by subclasses after checking and setting
# $self->{'convert_to'}.
#
# Sets:
#   $self->{'convert_to'}        - "auto" is rewritten to "html"
#   $self->{'convert_to_plugin'} - HTMLPlugin, TextPlugin,
#                                  StructuredHTMLPlugin or PagedImagePlugin
#   $self->{'convert_to_ext'}    - "html", "txt", or the image type taken
#                                  from "pagedimg_<jpg|gif|png>" (defaults
#                                  to "jpg")
#
# An unrecognised or undefined 'convert_to' leaves the plugin and
# extension fields untouched.
sub set_standard_convert_settings {
    my $self = shift (@_);

    my $convert_to = $self->{'convert_to'};
    return unless defined $convert_to;

    if ($convert_to eq "auto") {
        $convert_to = "html";
        $self->{'convert_to'} = "html";
    }

    if ($convert_to =~ /^html/) { # may be html or html_multi
        $self->{'convert_to_plugin'} = "HTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to eq "text") {
        $self->{'convert_to_plugin'} = "TextPlugin";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to eq "structuredhtml") {
        $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to =~ /^pagedimg/) {
        $self->{'convert_to_plugin'} = "PagedImagePlugin";
        # The alternation must be unescaped: an escaped "\|" would only
        # match a literal "jpg|gif|png" and every request would fall back
        # to jpg.
        my ($convert_to_ext) = $convert_to =~ /pagedimg_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }

    return;
}
# Initialise this plugin and then every loaded secondary plugin with the
# same verbosity and output/failure handles.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    # the parent class gets to initialise first
    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugins = $self->{'secondary_plugins'};
    for my $name (keys %$plugins) {
        $plugins->{$name}->init($verbosity, $outhandle, $failhandle);
    }
}
190
# Called only once, after all plugin passes have been done: forwards the
# deinit call to every loaded secondary plugin.
sub deinit {
    my $self = shift (@_);

    my $plugins = $self->{'secondary_plugins'};
    for my $name (keys %$plugins) {
        $plugins->{$name}->deinit();
    }
}
203
# Hook called by read_into_doc_obj with the converted file's name once
# conversion has succeeded. This default implementation does nothing.
sub convert_post_process {
    # no post processing by default
    return;
}
209
210
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a temporary directory so that we don't
# accidentally damage the input: the input file is linked into that
# directory, converted there, and the link is removed afterwards.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not used here
#                     (the requested format is $self->{'convert_to'})
#   $input_filename - file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# Returns the name of the converted file, or "" if the conversion failed.
#
# Side effects: sets $self->{'tmp_dir'} when a timestamped tmp folder was
# obtained, and sets $self->{'converted_to'} to "HTML", "Text" or
# "PagedImage" according to the type the converter reports.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");

    # work in a timestamped tmp folder if we can get one, otherwise fall
    # back to the directory the input file lives in
    my $tmp_dirname = &util::get_timestamped_tmp_folder();
    if (defined $tmp_dirname) {
        $self->{'tmp_dir'} = $tmp_dirname;
    } else {
        $tmp_dirname = $dirname;
    }

    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");

    my $lc_suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = $self->{'convert_to'};

    # NOTE(review): this command line goes through the shell; filenames
    # are only protected by double quotes.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    print STDERR "calling cmd $cmd\n";
    $output_type = `$cmd`;

    # Backticks give undef when the command could not be started; treat
    # that the same as an explicit failure report from the converter.
    $output_type = "fail" unless defined $output_type;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # num_not_processed is deliberately not incremented here: doing so
        # meant a failed conversion was counted twice.

        # copy whatever the converter logged to the output handle
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not read $errlog: $!\n";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # for "item" output the result is looked for in a sub-directory
        # named after the input file
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # swap the (lower-cased) input suffix for the output type; the
        # suffix is quoted so that regex metacharacters in it are literal
        $output_filename =~ s/\Q$lc_suffix\E$/.$output_type/;
    }

    return $output_filename;
}
337
338
# Convert the file, hand the converted output to the secondary plugin and
# then finish off the resulting document object.
#
# Overrides the inherited read_into_doc_obj - we need to call secondary
# plugin stuff.
#
# Returns:
#   -1              if conversion failed, the converted file is missing,
#                   or process() returned undef
#   0               if no secondary plugin is loaded (effectively blocks
#                   the file)
#   $rv             the secondary plugin's own return value when that is
#                   undefined or less than 1
#   (1, $doc_obj)   on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    my $converted_to = $self->{'converted_to'};
    if ($self->{'apply_fribidi'} && defined $converted_to && $converted_to =~ /(HTML|Text)/) {
        my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
        }
        else {
            # replace the converted file with fribidi's output
            &util::mv("${conv_filename}.tmp", $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n.";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level - an empty hash
    # is passed in place of $metadata
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used
    # in archives dbs for incremental build. so set it manually.
    $doc_obj->set_source_path($filename_full_path);
    $doc_obj->set_converted_filename($collect_conv_file);

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # record the input file's name without directory or extension
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata is already called by sec plugin in process??
    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
439
# Plugin specific processing of the document object: delegates to
# process_type, passing on only the base directory, file and doc object.
sub process {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
446
# Do plugin specific processing of doc_obj for the file's document type.
#
# Associates the original file with the top section of the document and
# records FileFormat, srcicon and srclink_file metadata there.
#
# Parameters:
#   $base_dir - joined with $file to form the path of the original file
#   $file     - file name; its extension, lower-cased, names the
#               associated file ("doc.<ext>") and the icon macro
#               ("_icon<ext>_")
#   $doc_obj  - document object to update
#
# When $self->{'keep_original_filename'} is 1, the associated file name
# and the srclink_file value are taken from the doc object instead of
# being "doc.<ext>".
#
# Returns 1.
sub process_type {
    my $self = shift (@_);
    my ($base_dir, $file, $doc_obj) = @_;

    # a file without an extension yields an empty $doc_ext rather than undef
    my ($doc_ext) = $file =~ /\.(\w+)$/;
    $doc_ext = defined $doc_ext ? lc($doc_ext) : "";

    my $file_type = "unknown";
    $file_type = $self->{'file_type'} if defined $self->{'file_type'};

    # the flag may be absent altogether, so check before comparing numerically
    my $keep_flag = $self->{'keep_original_filename'};
    my $keep_original = (defined $keep_flag && $keep_flag == 1);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    my $assocfilename = "doc.$doc_ext";
    if ($keep_original) {
        # this should be the same filename that was used for the Source and SourceFile metadata,
        # as we will use SourceFile in the srclink (below)
        $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
    }

    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = "doc.$doc_ext";
    if ($keep_original) {
        $srclink_filename = $doc_obj->get_sourcefile();
    }
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $srclink_filename);

    return 1;
}
480
# Remove the temporary conversion directory recorded in
# $self->{'tmp_dir'}, if there is one and it still exists, and then
# forget about it.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    my $conversion_dir = $self->{'tmp_dir'};
    if (defined($conversion_dir) && -d $conversion_dir) {
        &util::rm_r($conversion_dir);
        $self->{'tmp_dir'} = undef;
    }
}
493	1;
494
495
496
497
498
499
500

Note: See TracBrowser for help on using the repository browser.

Download in other formats: