Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm@ 21760

Last change on this file since 21760 was 21760, checked in by kjdon, 14 years ago
srclink now generated dynamically at runtime. instead of storing srclink metadata, we store srclink_file metadata, which can be a value (doc.doc) or a metadata format element (eg [SourceFile]).
Property svn:keywords set to `Author Date Id Revision`
File size: 18.1 KB

Line
1	###########################################################################
2	#
3	# ConvertBinaryFile.pm -- plugin that facilitates conversion of binary files
4	# through gsConvert.pl
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin,
29	# PostScriptPlugin,
30	# RTFPlugin and PDFPlugin. It facilitates the conversion of these document types
31	# to either HTML, Text or a series of images. It works by dynamically loading
32	# an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin,
33	# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
34
35	package ConvertBinaryFile;
36
37	use AutoExtractMetadata;
38	use ghtml;
39	use HTMLPlugin;
40	use TextPlugin;
41	use PagedImagePlugin;
42
43	use strict;
44	no strict 'refs'; # allow filehandles to be variables and viceversa
45	no strict 'subs';
46
# Set up inheritance at compile time: ConvertBinaryFile derives from
# AutoExtractMetadata.
BEGIN {
    @ConvertBinaryFile::ISA = qw(AutoExtractMetadata);
}
50
# Choices offered for the 'convert_to' argument.  The 'desc' values look like
# "{Package.key}" lookup keys (presumably resolved to display text elsewhere).
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
];

# Argument descriptions contributed by this plugin; new() appends them to the
# caller's "ArgList".
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertBinaryFile.keep_original_filename}',
        'type' => 'flag',
    },
    {
        # declared as a plain string (a "regexp" type was left commented out
        # in earlier revisions); the description is shared with HTMLPlugin
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertBinaryFile.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertBinaryFile.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'ConvertBinaryFile',
    'desc'     => '{ConvertBinaryFile.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
90
91
# Load and instantiate the secondary plugin(s) named by $self->{'convert_to'}.
#
# $self->{'convert_to'} is split on commas.  For each entry "X" the package
# file "XPlugin.pm" is looked for first in
# $ENV{'GSDLCOLLECTDIR'}/perllib/plugins (only when that variable is defined)
# and then in $ENV{'GSDLHOME'}/perllib/plugins, and the first one that exists
# is require'd.  The class "XPlugin" is then constructed with an empty plugin
# list and the argument list stored under $input_args->{"XPlugin"}.
#
# Arguments:
#   $class, $hashArgOptLists - accepted for interface compatibility; not used.
#   $input_args              - hash ref: plugin class name => argument list.
#
# Side effects: stores a hash ref (plugin class name => plugin object) in
# $self->{'secondary_plugins'}.
# Dies if a plugin file cannot be found, or if a constructor throws.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$input_args,$hashArgOptLists) = @_;

    my @convert_to_list = split(",",$self->{'convert_to'});
    my $secondary_plugins = {};

    foreach my $convert_to (@convert_to_list) {
        # work out the "convert_to" plugin package
        my $plugin_class = $convert_to."Plugin";
        my $plugin_package = $plugin_class.".pm";

        # a collection-specific copy of the plugin takes precedence
        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib","plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib","plugins",
                                               $plugin_package);

        if ((defined $colplugname) && (-e $colplugname)) { require $colplugname; }
        elsif (-e $mainplugname) { require $mainplugname; }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # call its constructor with extra options that we've worked out!
        my $arglist = $input_args->{$plugin_class};

        # Direct class-method call instead of a string eval using indirect
        # object syntax: nothing is compiled from the plugin name, and the
        # call cannot be mis-parsed as a call to this package's own new().
        # The block eval keeps the old behaviour of re-throwing the
        # stringified error.
        my $secondary_plugin = eval { $plugin_class->new([], $arglist) };
        die "$@" if $@;
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }
    $self->{'secondary_plugins'} = $secondary_plugins;
}
135
# Constructor.
#
# Registers this package's argument and option descriptions, builds the
# object through AutoExtractMetadata, then normalises the requested
# 'convert_to' value into:
#   $self->{'convert_to'}     - "HTML", "Text", "StructuredHTML" or "PagedImage"
#   $self->{'convert_to_ext'} - "html", "txt", or an image extension
# A value that matches none of the known types leaves both fields as the
# parent constructor set them.
#
# Arguments: ($pluginlist, $inputargs, $hashArgOptLists), passed on to the
# parent constructor after this class has been appended to $pluginlist.
# Returns the object blessed into $class.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    my $classPluginName = $class;
    $classPluginName = $pluginlist->[0] if defined $pluginlist->[0];

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    # when only plugin information is wanted, the options below do not matter
    return bless($self, $class) if $self->{'info_only'};

    my $convert_to_type = $self->{'convert_to'};
    $convert_to_type = "auto"
        unless defined($convert_to_type) && $convert_to_type ne "";

    my $windows_scripting = $self->{'windows_scripting'};
    $windows_scripting = 0 if !defined $windows_scripting;

    # Consulted lazily, only at the points where the decision needs it.
    my $on_windows = sub { return $ENV{'GSDLOS'} =~ /^windows$/i; };

    # Per-plugin adjustments of the requested type.
    if ($classPluginName eq "PDFPlugin") {
        if ($convert_to_type eq "text" && $on_windows->()) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $convert_to_type = "html";
        }
    }
    elsif ($classPluginName eq "WordPlugin") {
        # with Windows scripting, Word documents go to structured HTML
        $convert_to_type = "structuredhtml"
            if $windows_scripting && $on_windows->() && $convert_to_type =~ /^(html|auto)$/;
    }
    elsif ($classPluginName eq "PowerPointPlugin") {
        # with Windows scripting, "auto" means paged images
        $convert_to_type = "pagedimg_jpg"
            if $windows_scripting && $on_windows->() && $convert_to_type eq "auto";
    }
    elsif ($classPluginName eq "PostScriptPlugin") {
        # PostScript defaults to text
        $convert_to_type = "text" if $convert_to_type eq "auto";
    }

    # anything still undecided becomes html
    $convert_to_type = "html" if $convert_to_type eq "auto";

    # requested type => [ secondary plugin prefix, output extension ]
    my %target_for = (
        "html"           => [ "HTML",           "html" ],
        "text"           => [ "Text",           "txt"  ],
        "structuredhtml" => [ "StructuredHTML", "html" ],
    );

    if (exists $target_for{$convert_to_type}) {
        my ($plugin_prefix, $extension) = @{ $target_for{$convert_to_type} };
        $self->{'convert_to'} = $plugin_prefix;
        $self->{'convert_to_ext'} = $extension;
    }
    elsif ($convert_to_type =~ /^pagedimg/) {
        $self->{'convert_to'} = "PagedImage";
        # the image format may follow as pagedimg_<ext>; jpg otherwise
        my ($image_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
        $self->{'convert_to_ext'} = defined($image_ext) ? $image_ext : 'jpg';
    }

    return bless $self, $class;
}
203
204
# Initialise this plugin through the parent class, then call init with the
# same three arguments on every secondary plugin held in
# $self->{'secondary_plugins'}.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $plugin_for = $self->{'secondary_plugins'};
    for my $name (keys %$plugin_for) {
        $plugin_for->{$name}->init($verbosity, $outhandle, $failhandle);
    }
}
218
# Called only once, after all plugin passes have been done: forwards deinit
# (with no arguments) to every secondary plugin held in
# $self->{'secondary_plugins'}.
sub deinit {
    my $self = $_[0];

    my $plugin_for = $self->{'secondary_plugins'};
    for my $name (keys %$plugin_for) {
        $plugin_for->{$name}->deinit();
    }
}
231
# Hook run on the converted file (read_into_doc_obj calls it with the
# converted file's name).  The default implementation deliberately does
# nothing and returns nothing.
sub convert_post_process {
    return;
}
237
238
239	# Run conversion utility on the input file.
240	#
241	# The conversion takes place in a collection specific 'tmp' directory so
242	# that we don't accidentally damage the input.
243	#
244	# The desired output type is indicated by $output_ext. This is usually
245	# something like "html" or "word", but can be "best" (or the empty string)
246	# to indicate that the conversion utility should do the best it can.
# Arguments:
#   $output_ext     - requested output extension; accepted but not consulted
#                     here (the object's 'convert_to' / 'convert_to_ext'
#                     settings decide what is requested from gsConvert.pl)
#   $input_filename - path of the file to convert
#   $textref        - accepted but not used
#
# Returns the name of the converted file, or "" when gsConvert.pl printed
# "fail".  On success the format actually produced is recorded in
# $self->{'converted_to'} ("HTML", "Text" or "PagedImage").
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    # fileparse is called fully qualified below; make sure the module is
    # loaded rather than relying on some other package having loaded it
    require File::Basename;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir: <GSDLCOLLECTDIR>/tmp if set, else
    # <GSDLHOME>/tmp, else a tmp directory next to the input file
    my $tmp_dirname = $dirname;
    if (defined $ENV{'GSDLCOLLECTDIR'}) {
        $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
    } elsif (defined $ENV{'GSDLHOME'}) {
        $tmp_dirname = $ENV{'GSDLHOME'};
    }
    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # The filename is deliberately NOT stripped of white space, dots or
    # hyphens: that is not necessary and would cause problems with
    # replacing_srcdoc_with_html in the GSDLremote case.

    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);

    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    # files on the filesystem.
    $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
    # But we can't softlink to relative paths. Therefore, we need to ensure that
    # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    my $ensure_path_absolute = 1; # true
    &util::soft_link($input_filename, $tmp_filename, $ensure_path_absolute);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type="";
    if ($convert_to =~ m/PagedImage/i) {
        $output_type = lc($convert_to)."_".lc($convert_to_ext);
    } else {
        $output_type = lc($convert_to);
    }

    # NOTE(review): the file names are placed inside double quotes in a shell
    # command line; names containing quotes or other shell metacharacters are
    # not escaped.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    # gsConvert.pl reports the type it actually produced on its standard output
    $output_type = `$cmd`;
    $output_type = "" unless defined $output_type; # the command could not be run at all

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # num_not_processed is deliberately not incremented here: doing so
        # meant a failed conversion was counted twice.

        # copy the converter's error log, if it has any content, to the output handle
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "Text";
    } elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImage";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # an item file lives in a sub-directory named after the document
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # swap the original suffix for the produced type; \Q..\E makes the
        # suffix match literally even if it contains regex metacharacters
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
369
370
371	# Override BasPlug read_into_doc_obj - we need to call secondary plugin stuff
# Convert $file with tmp_area_convert_file, hand the converted file to the
# (single) secondary plugin to build a document object, then fix up that
# object so that it describes the original source file.
#
# Returns:
#   -1             if conversion failed, the converted file does not exist,
#                  or process() returned undef
#   0              if there is no secondary plugin (the file is skipped)
#   $rv            whatever the secondary plugin returned, when that is
#                  undefined or less than 1
#   (1, $doc_obj)  on success
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # fileparse is called fully qualified below; make sure the module is
    # loaded rather than relying on some other package having loaded it
    require File::Basename;

    my $outhandle = $self->{'outhandle'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
        my $fribidi_output = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirection may already have created the output
            # file; do not leave it behind
            unlink($fribidi_output) if -e $fribidi_output;
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    # only one secondary plugin is used: whichever key comes out first
    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level - the secondary
    # plugin is given an empty hash in place of $metadata
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, {}, $processor, $maxdocs, $total_count, $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    # set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    # build. so set it manually.
    $doc_obj->{'source_path'} = $filename_full_path;
    $doc_obj->set_converted_filename($collect_conv_file);

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # FilenameRoot is the source file name without directory or extension
    my ($tailname)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    my $topsection = $doc_obj->get_top_section();
    $self->add_associated_files($doc_obj, $filename_full_path);

    # extra_metadata may already have been applied by the secondary plugin;
    # it is applied here with the caller's $metadata
    $self->extra_metadata($doc_obj, $topsection, $metadata);
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj,$topsection,$filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
469
# Plugin-specific processing of a document object: delegates to
# process_type with the base directory, file name and document object.
# The $pluginfo, $metadata and $gli arguments are accepted but not used.
# Returns whatever process_type returns.
sub process {
    my ($self, undef, $base_dir, $file, undef, $doc_obj) = @_;

    return $self->process_type($base_dir, $file, $doc_obj);
}
476
477	# do plugin specific processing of doc_obj for doc_ext type
# Attach the original source file to the document object and record
# metadata about it on the top section:
#   FileFormat   - $self->{'file_type'}, or "unknown" when that is undefined
#   srcicon      - "_icon<ext>_", <ext> being $self->{'filename_extension'}
#   srclink_file - "doc.<ext>", or "[SourceFile]" when the
#                  keep_original_filename option equals 1
# The source file is associated under the name "doc.<ext>", or, with
# keep_original_filename, under the name the document object reports via
# get_assocfile_from_sourcefile.
# Always returns 1.  Note that the extension is not checked for being empty.
sub process_type {
    my ($self, $base_dir, $file, $doc_obj) = @_;

    my $doc_ext = $self->{'filename_extension'};
    my $file_type = defined($self->{'file_type'}) ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);

    my $assoc_name = "doc.$doc_ext";
    # this should be the same filename that was used for the Source and
    # SourceFile metadata, as [SourceFile] is used in the srclink
    $assoc_name = $doc_obj->get_assocfile_from_sourcefile()
        if $self->{'keep_original_filename'} == 1;
    $doc_obj->associate_file($source_path, $assoc_name, undef, $top_section);

    # set rather than add: only one FileFormat value is wanted
    $doc_obj->set_utf8_metadata_element($top_section, "FileFormat", $file_type);

    my $srclink_target = "doc.$doc_ext";
    $srclink_target = "[SourceFile]"
        if $self->{'keep_original_filename'} == 1;

    $doc_obj->add_utf8_metadata($top_section, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata($top_section, "srclink_file", $srclink_target);

    return 1;
}
512
513	1;
514
515
516
517
518
519
520

Note: See TracBrowser for help on using the repository browser.

Download in other formats: