Context Navigation

source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 3350

Last change on this file since 3350 was 3350, checked in by sjboddie, 22 years ago
Added -use_strings option to ConvertToPlug. The default behaviour for plugins derived from ConvertToPlug (WordPlug, PDFPlug etc) is now to exclude documents that can't be converted correctly. They won't use the perl strings stuff to extract text unless the -use_strings option is specified.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	###########################################################################
2	#
3	# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4	# on plugin argument convert_to
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29	# It facilitates the conversion of these document types to either HTML
30	# or TEXT by setting up variables that instruct ConvertToBasPlug
31	# how to work.
32
33	# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34	# the plugin argument 'convert_to'. If the argument is not present,
35	# the default is to inherit HTMLPlug.
36
37
38	package ConvertToPlug;
39
40	use BasPlug;
41	use HTMLPlug;
42	use TEXTPlug;
43	use ghtml;
44
# Establish the compile-time parent class of this package.
# HTMLPlug is the only parent listed; new() below builds the object
# through either HTMLPlug or TEXTPlug and then re-blesses it.
BEGIN {
    @ISA = qw(HTMLPlug);
    # Alternatives that were tried and left disabled:
    #   @ISA = ('HTMLPlug', 'TEXTPlug');
    #   @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
}
50
# Write the option summary for this plugin to STDERR.
#
# $plugin_name - either the plugin's name as a string, or a plugin
#                object (the case when pluginfo.pl calls this function
#                directly), in which case its class name is used.
sub print_usage {
    my ($plugin_name) = @_;

    # an object was handed in: report under its class name
    $plugin_name = ref ($plugin_name) if ref ($plugin_name);

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html\|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
        " -use_strings if set a simple strings function\n",
        " will be called to extract text\n",
        " if the conversion utility fails\n",
    );

    # one print per line, exactly as many writes as there are lines
    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
}
67
# Parse the plugin options that ConvertToPlug itself understands.
#
# Called as a class method: $class->parse_args(\@args).
#
#   $args - reference to the plugin's argument list; it is handed to
#           parsargv::parse, which may modify it.
#
# Returns the list ($plugin_name, $newargs), where $plugin_name is $class
# with any trailing ".pm" removed and $newargs is a hash reference with
# the keys 'kea', 'kea_options', 'generate_format' and 'use_strings'.
#
# Dies (after printing the usage message to STDERR) if parsargv::parse
# reports failure.
sub parse_args
{
    my $class = shift (@_);
    my ($args) = @_;

    my $plugin_name = $class;
    $plugin_name =~ s/\.pm$//;

    my $newargs = {};

    # NOTE: q^...^ is a single-quoted string, so a backslash in front of
    # '|' would be kept literally and end up inside the option's regex.
    # The alternation must therefore be written as a bare (html|text).
    my $parsed_ok = parsargv::parse($args,
        # presumably a plain flag (no value pattern given) -- confirm
        # against parsargv
        q^extract_keyphrases^, \$newargs->{'kea'},
        # takes a value; the pattern /.*/ accepts any string
        q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'},
        # value must be html or text; "html" is the default
        q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
        q^use_strings^, \$newargs->{'use_strings'},
        # options not listed here belong to the parent plugins
        "allow_extra_options");

    if (!$parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $newargs);
}
93
# Constructor.
#
# Builds the object through TEXTPlug or HTMLPlug, depending on the
# -convert_to option, records the chosen target format on the object and
# re-blesses the result into $class.
#
# When invoked with "ConvertToPlug" as the class, the next argument is
# taken to be the real (derived) class name.
#
# Fields set here:
#   convert_to          "TEXT" or "HTML"
#   convert_to_ext      "txt"  or "html"
#   rename_assoc_files  1 (HTML only)
#   metadata_fields     ",GENERATOR" appended (HTML only)
# plus every key returned by parse_args.
sub new {
    my $class = shift (@_);
    if ($class eq "ConvertToPlug") {$class = shift (@_);}
    my $self;

    # parsargv::parse might modify the list, so the parent constructors
    # are given an untouched copy of the argument list.
    my @arglist = @_;
    my ($plugin_name, $args) = $class->parse_args(\@_);

    # GSDLOS may be unset; treat that as "not windows" without matching
    # a regex against undef.
    my $on_windows = defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i;

    if ($class eq "PDFPlug" && $args->{'generate_format'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $args->{'generate_format'} = "html";
    }

    # Arrow-style constructor calls: the indirect-object form
    # ("new TEXTPlug (...)") depends on parser heuristics, which is
    # fragile inside a package that defines its own new().
    if ($args->{'generate_format'} eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # copy the options parsed above onto the object
    foreach my $key (keys %$args) {
        $self->{$key} = $args->{$key};
    }

    return bless $self, $class;
}
131
132
133
134	# Run conversion utility on the input file.
135	#
136	# The conversion takes place in a collection specific 'tmp' directory so
137	# that we don't accidentally damage the input.
138	#
139	# The desired output type is indicated by $output_ext. This is usually
140	# something like "html" or "word", but can be "best" (or the empty string)
141	# to indicate that the conversion utility should do the best it can.
142
# Convert $input_filename via gsConvert.pl inside the collection's tmp
# directory.
#
#   $output_ext     - requested output type (not used in the body; the
#                     format passed to gsConvert.pl is taken from
#                     $self->{'convert_to'})
#   $input_filename - path of the file to convert
#   $textref        - accepted for interface compatibility; unused here
#
# Returns the name of the converted file in the tmp directory, or "" if
# the converter printed "fail".  On success $self->{'convert_to_ext'} is
# set to whatever type the converter printed, and $self->{'converted_to'}
# to "HTML" or "TEXT" when that type matches one of them.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the filename; this makes later
    # conversion by the utilities simpler.  Spaces in the path are left.
    $tailname =~ s/\s+//g;

    my $tmp_filename = util::filename_cat($tmp_dirname, "$tailname$suffix");

    util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the file names are interpolated into a shell command
    # line protected only by double quotes; a name containing '"', '$' or
    # a backquote would be interpreted by the shell.  Left as is because
    # the list form of a piped open is not available on every platform
    # this code may run on -- confirm before changing.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    util::rm($tmp_filename);

    # the converter prints the resulting type (or "fail") on stdout
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;

        # relay the converter's error log, if it wrote one
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close $errlog_fh;
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }

    # Swap the original suffix for the output type.  The suffix is
    # quoted so that its leading '.' (or any other metacharacter) is
    # matched literally rather than as a regex.
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
221
222
223	# Remove collection specific tmp directory and all its contents.
224
# Empty the collection's tmp directory by deleting it recursively and
# creating it again.  The directory is <GSDLCOLLECTDIR>/tmp.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collect_tmp = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    util::rm_r($collect_tmp);
    util::mk_dir($collect_tmp);
}
233
234
235
236
237	# Override BasPlug read
238	# We don't want to get language encoding stuff until after we've converted
239	# our file to either TEXT or HTML.
# Read one file: convert it, build a document object from the converted
# text and hand that object to $processor.
#
#   $pluginfo  - passed through to process()
#   $base_dir  - directory $file is relative to (ignored unless it
#                contains a word character)
#   $file      - name of the file to read
#   $metadata  - metadata from previous plugins, attached to the top
#                section of the new document
#   $processor - object whose process() receives the finished document
#   $maxdocs   - accepted but not used here
#
# Returns:
#   undef - the file does not match process_exp, is not a plain file, or
#           process() returned undef
#   0     - the file matched block_exp, could not be converted, or the
#           converted file held no text
#   1     - the document was processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    if (! -e "$conv_filename") {return 0;} # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff -- on the converted file, not the original
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        # do not leave the converted file behind in the tmp area
        $self->cleanup_tmp_area();
        return 0;
    }

    # Convert character entities (e.g. &eacute;) to utf-8.  This is meant
    # for HTML output but is applied whatever the converted format is.
    # It should really happen before language extraction, but that would
    # mean modifying a file on disk...
    $text =~ s/&([^;]+);/ghtml::getcharequiv($1,0)/ge;

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    # Source is the file name without any directory part
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($filemeta));
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $top_section, $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    if (!defined ($processed)) {
        # do not leave the converted file behind in the tmp area
        $self->cleanup_tmp_area();
        return undef;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
314
315
316	# do plugin specific processing of doc_obj for HTML type
# Run the TEXTPlug or HTMLPlug processing on the converted text, then
# attach the original source file to the document.
#
#   $doc_ext  - extension used for the associated original ("doc.<ext>")
#               and for the srcicon macro name ("_icon<ext>_")
#   $textref  - reference to the converted text
#   $pluginfo, $metadata, $doc_obj - handed on to the parent's process()
#   $base_dir, $file - location of the original (unconverted) file
#
# The parent's process() is given the directory and file name of the
# converted file ($self->{'conv_filename'}) in place of $base_dir/$file.
# Returns whatever that process() call returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $conv_filename = $self->{'conv_filename'};
    my $conv_dir = File::Basename::dirname($conv_filename);
    my $conv_tail = File::Basename::basename($conv_filename);

    # pick the parent routine matching what the converter actually
    # produced; anything other than TEXT is handled as HTML
    my $parent_process = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $parent_process->($self, $textref, $pluginfo,
                                    $conv_dir, $conv_tail,
                                    $metadata, $doc_obj);

    # associate original file with doc object
    my $top = $doc_obj->get_top_section();
    my $orig_path = util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($orig_path, "doc.$doc_ext", undef, $top);

    # metadata that lets the interface link back to the original file
    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata ($top, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($top, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($top, "/srclink", "</a>");

    return $ret_val;
}
353
354	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: