Context Navigation

source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 2755

Last change on this file since 2755 was 2755, checked in by jrm21, 23 years ago
import.pl now takes an option for saving file conversion failures to a log. By default, import.pl will use <collectdir>/etc/fail.log. Currently only the plugins based on ConvertToPlug will do this. Not yet tested on Win9X.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.3 KB

Line
1	###########################################################################
2	#
3	# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4	# on plugin argument convert_to
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29	# It facilitates the conversion of these document types to either HTML
30	# or TEXT by setting up variables that instruct ConvertToPlug
31	# how to work.
32
33	# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34	# the plugin argument 'convert_to'. If the argument is not present,
35	# the default is to inherit HTMLPlug.
36
37
38	package ConvertToPlug;
39
40	use BasPlug;
41	use HTMLPlug;
42	use TEXTPlug;
43	use ghtml;
44
# Establish the compile-time parent class of this package.  HTMLPlug is the
# only parent that is ever registered here.
BEGIN {
    @ISA = qw(HTMLPlug);
}
50
51	# use strict; # this breaks 'print $outhandle ' error msgs.
52
# Write a short usage summary for the plugin to STDERR.
#
# $plugin_name is either the plugin's name as a string or, when this
# function is called directly by pluginfo.pl, a reference whose class name
# is reported instead.
sub print_usage {
    my ($plugin_name) = @_;

    # for when this function is called directly by pluginfo.pl
    my $class_of_arg = ref ($plugin_name);
    $plugin_name = $class_of_arg if $class_of_arg;

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
    );

    # One print per line, exactly as before; hand back the last print status.
    my $status;
    for my $line (@usage_lines) {
        $status = print STDERR $line;
    }
    return $status;
}
66
# Parse the option list handed to the plugin.
#
# Called as a class method with a reference to the argument list.
# Returns ($plugin_name, $generate_format, $kea_arg):
#   $plugin_name     - the class name with a trailing ".pm" removed
#   $generate_format - value of -convert_to (the option spec declares "html"
#                      as its default)
#   $kea_arg         - hash reference with the keys 'kea' and 'kea_options'
# If parsargv::parse reports failure, prints a diagnostic plus the usage
# text to STDERR and dies.
sub parse_args {
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my $generate_format;
    my $kea_arg = {};

    my @option_spec = (
        q^extract_keyphrases^,            \$kea_arg->{'kea'},         # no extra options
        q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'}, # with extra options
        q^convert_to/(html|text)/html^,   \$generate_format,
        "allow_extra_options",
    );

    if (parsargv::parse($args, @option_spec)) {
        return ($plugin_name, $generate_format, $kea_arg);
    }

    print STDERR "\nIncorrect options passed to $plugin_name, ";
    print STDERR "check your collect.cfg configuration file\n";
    &print_usage($plugin_name);
    die "\n";
}
92
# Constructor.
#
# Builds the object through TEXTPlug when -convert_to is "text" and through
# HTMLPlug otherwise, records the target format in 'convert_to' and
# 'convert_to_ext', and re-blesses the result into the calling plugin class.
# When invoked with "ConvertToPlug" as the class, the real plugin class is
# taken from the next argument.
sub new {
    my $class = shift (@_);
    $class = shift (@_) if $class eq "ConvertToPlug";

    # parsargv::parse might modify the list, so the parent constructor is
    # given an untouched copy of the arguments.
    my @arglist = @_;
    my ($plugin_name, $generate_format, $kea_arg) = $class->parse_args(\@_);

    # PDF to text is not available on Windows: fall back to HTML.
    if ($class eq "PDFPlug" && $generate_format eq "text") {
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $generate_format = "html";
        }
    }

    my $self;
    if ($generate_format eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # if kea data to be extracted...
    if ($kea_arg->{'kea'}) {
        $self->{'kea'} = 1;
    }
    if ($kea_arg->{'kea_options'}) {
        $self->{'kea_options'} = 1;
    }

    return bless $self, $class;
}
130
131
132
133	# Run conversion utility on the input file.
134	#
135	# The conversion takes place in a collection specific 'tmp' directory so
136	# that we don't accidentally damage the input.
137	#
138	# The desired output type is indicated by $output_ext. This is usually
139	# something like "html" or "word", but can be "best" (or the empty string)
140	# to indicate that the conversion utility should do the best it can.
141
# Run gsConvert.pl on $input_filename inside the collection's "tmp"
# directory and report where the converted file is expected to be.
#
# Parameters:
#   $output_ext     - requested output extension; accepted for interface
#                     compatibility, the type actually requested is
#                     lc($self->{'convert_to'})
#   $input_filename - path of the file to convert
#   $textref        - accepted but not used here
#
# Returns the path of the converted file (the tmp copy's name with its
# suffix replaced by the type the converter reported), or "" when the
# converter reports "fail".  On success $self->{'convert_to_ext'} is set to
# the reported type.  On failure the converter's err.log is appended to
# $self->{'faillogname'} when that is set and the log is non-empty.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the command goes through the shell with the file name
    # interpolated inside double quotes; a name containing ", $ or ` would
    # be interpreted by the shell.  Left as is because list-form pipe opens
    # are not portable to every platform this may run on -- confirm.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";

        my $faillog = $self->{'faillogname'};
        if (defined $faillog && $faillog ne "" && -s $errlog) {
            # Append the converter's error log to the failure log, reporting
            # (rather than silently ignoring) any file that cannot be opened.
            if (open(my $savelog, '>>', $faillog)) {
                print $savelog "$tailname$suffix (converting to $convert_to) failed:\n";
                if (open(my $errfh, '<', $errlog)) {
                    while (defined(my $line = <$errfh>)) {
                        print $savelog $line;
                    }
                    close($errfh);
                }
                else {
                    print $outhandle "Could not read $errlog: $!\n";
                }
                print $savelog "\n";
                close($savelog)
                    or print $outhandle "Could not finish writing $faillog: $!\n";
            }
            else {
                print $outhandle "Could not append to $faillog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    $self->{'convert_to_ext'} = $output_type;
    my $output_filename = $tmp_filename;

    # The suffix is literal text, not a pattern: quote it so that characters
    # such as "(", "+" or "." in the extension cannot stop the match.
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
209
210
211	# Remove collection specific tmp directory and all its contents.
212
# Empty the collection specific tmp directory: remove it together with all
# of its contents, then create it again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
    my $scratch_dir = &util::filename_cat($collect_dir, "tmp");

    &util::rm_r($scratch_dir);
    return &util::mk_dir($scratch_dir);
}
221
222
223
224
225	# Override BasPlug read
226	# We don't want to get language encoding stuff until after we've converted
227	# our file to either TEXT or HTML.
# Convert one input file and turn the converted text into a document.
#
# Language/encoding detection is deliberately done on the converted file,
# not on the original input.
#
# Parameters: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
# $maxdocs is accepted but not used here.
#
# Returns:
#   undef - the file does not match 'process_exp' or is not a plain file,
#           or $self->process returned undef
#   0     - the file matches 'block_exp', the conversion produced nothing,
#           or the converted file contains no text
#   1     - the document was built and handed to $processor
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);
    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    if (! -e "$conv_filename") {return 0;} # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding ($conv_filename);
    } elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata.  Assign to the outer
        # $language: re-declaring it here would leave the "Language"
        # metadata below undefined.
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
        $encoding = $self->{'input_encoding'};
        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }
    } else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($file));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    # do plugin specific processing of doc_obj
    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    # the converted file is no longer needed once the document is processed
    $self->cleanup_tmp_area();

    return 1;
}
304
305
306	# do plugin specific processing of doc_obj for HTML type
# Hand the converted text to TEXTPlug::process or HTMLPlug::process
# (chosen by $self->{'convert_to'}), then associate the original input file
# with the document and add the srclink / srcicon / /srclink metadata that
# refer to it.
#
# $doc_ext is the extension used for the associated copy ("doc.$doc_ext").
# Returns whatever the parent plugin's process function returned.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # The parent plugin is pointed at the converted file in the tmp area,
    # not at the original input.
    my $converted = $self->{'conv_filename'};
    my @parent_args = (
        $self, $textref, $pluginfo,
        File::Basename::dirname($converted),
        File::Basename::basename($converted),
        $metadata, $doc_obj,
    );

    my $ret_val = ($self->{'convert_to'} eq "TEXT")
        ? &TEXTPlug::process(@parent_args)
        : &HTMLPlug::process(@parent_args);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $original = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($original, "doc.$doc_ext", undef, $top_section);

    $doc_obj->add_utf8_metadata ($top_section, "srclink",
        "<a href=_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext>");
    $doc_obj->add_utf8_metadata ($top_section, "srcicon", "_icon${doc_ext}_");
    $doc_obj->add_utf8_metadata ($top_section, "/srclink", "</a>");

    return $ret_val;
}
343
344	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: