Context Navigation

source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

Property svn:keywords set to Author Date Id Revision

File size: 11.3 KB

Line
1	###########################################################################
2	#
3	# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4	# on plugin argument convert_to
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29	# It facilitates the conversion of these document types to either HTML
30	# or TEXT by setting up variables that instruct ConvertToPlug
31	# how to work.
32
33	# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34	# the plugin argument 'convert_to'. If the argument is not present,
35	# the default is to inherit HTMLPlug.
36
37
38	package ConvertToPlug;
39
40	use BasPlug;
41	use HTMLPlug;
42	use TEXTPlug;
43	use ghtml;
44
# Compile-time inheritance set-up: method lookup for this package goes
# through HTMLPlug.  (Listing TEXTPlug and/or BasPlug here as well was
# tried in the past and is deliberately not done.)
BEGIN {
    @ISA = qw(HTMLPlug);
}
50
# Print a short usage summary for the plugin to STDERR.
#
# $plugin_name is normally the plugin's name as a string.  When this
# function is called directly by pluginfo.pl it receives a plugin object
# instead, in which case the object's class name is used.
sub print_usage {
    my ($plugin_name) = @_;

    # for when this function is called directly by pluginfo.pl
    $plugin_name = ref($plugin_name) if ref($plugin_name);

    # The alternation bar is written plainly: "\|" is not a recognised
    # escape in a double-quoted string and only produces a warning.
    print STDERR "\n usage: plugin $plugin_name [options]\n\n"
               . " options:\n"
               . " -convert_to (html|text) plugin converts to TEXT or HTML\n"
               . " (default html)\n";
}
64
# Parse the options that ConvertToPlug itself understands.
#
# Called as a class method: $class->parse_args(\@args).
#
#   $args - reference to the plugin's argument list; it is handed to
#           parsargv::parse unchanged.
#
# Returns the list ($plugin_name, $generate_format, $kea_arg):
#   $plugin_name     - $class with any trailing ".pm" removed
#   $generate_format - value of -convert_to (spec default "html")
#   $kea_arg         - hash ref whose 'kea' and 'kea_options' entries
#                      receive the -extract_keyphrases and
#                      -extract_keyphrase_options values
#
# If parsargv::parse reports failure, an error and the usage text are
# printed to STDERR and the function dies.
sub parse_args {
    my $class = shift(@_);
    my ($args) = @_;

    my $plugin_name = $class;
    $plugin_name =~ s/\.pm$//;

    my $generate_format;
    my $kea_arg = {};

    # Each spec is "name[/regex/[default]]".  The convert_to pattern must
    # contain a bare "|": inside q^...^ a backslash is kept literally, so
    # "(html\|text)" would only ever match the string "html|text".
    my $parsed_ok = parsargv::parse(
        $args,
        q^extract_keyphrases^,              \$kea_arg->{'kea'},         # no extra options
        q^extract_keyphrase_options/.*/^,   \$kea_arg->{'kea_options'}, # with extra options
        q^convert_to/(html|text)/html^,     \$generate_format,
        "allow_extra_options");

    if (!$parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $generate_format, $kea_arg);
}
90
# Constructor.
#
# Builds the object with either TEXTPlug's or HTMLPlug's constructor,
# depending on the -convert_to option, and re-blesses it into $class.
#
# When invoked with "ConvertToPlug" as the class, the next argument is
# taken to be the real class name; the remaining arguments are the plugin
# options.
#
# Fields set here:
#   convert_to         - "TEXT" or "HTML"
#   convert_to_ext     - "txt" or "html"
#   rename_assoc_files - 1 (HTML only)
#   metadata_fields    - ",GENERATOR" appended (HTML only)
#   kea, kea_options   - set to 1 when the matching option was given
sub new {
    my $class = shift(@_);
    if ($class eq "ConvertToPlug") { $class = shift(@_); }

    # parsargv::parse might modify the list, so we keep a copy of the
    # argument list to hand to the parent constructor.
    my @arglist = @_;
    my (undef, $generate_format, $kea_arg) = $class->parse_args(\@_);

    # GSDLOS may not be set in every environment; treat that as "not windows"
    # without matching against an undefined value.
    my $gsdlos = defined($ENV{'GSDLOS'}) ? $ENV{'GSDLOS'} : "";

    if ($class eq "PDFPlug" && $generate_format eq "text" &&
        $gsdlos =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $generate_format = "html";
    }

    my $self;
    if ($generate_format eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'}     = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'}     = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # if kea data to be extracted...
    $self->{'kea'} = 1 if ($kea_arg->{'kea'});
    # NOTE(review): this stores the flag 1, not the option string that was
    # parsed into $kea_arg->{'kea_options'} -- confirm that is intended.
    $self->{'kea_options'} = 1 if ($kea_arg->{'kea_options'});

    return bless $self, $class;
}
128
129
130
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input: the input file is soft
# linked into that directory and the link is what gets converted.
#
# Arguments:
#   $output_ext     - accepted for interface compatibility but not
#                     consulted here; the requested format is taken from
#                     $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - unused
#
# Returns the expected path of the converted file inside the tmp
# directory (the caller checks that it exists), or "" if the converter
# reported "fail".  On success $self->{'convert_to_ext'} is overwritten
# with the output type printed by the converter; on failure
# $self->{'num_not_processed'} is incremented and the converter's error
# log, if any, is copied to the output handle.
sub tmp_area_convert_file {
    my $self = shift(@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle  = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity  = $self->{'verbosity'};

    # softlink to collection tmp dir
    my $tmp_dirname = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = util::filename_cat($tmp_dirname, "$tailname$suffix");

    util::soft_link($input_filename, $tmp_filename);

    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the file names are interpolated into a shell command
    # line; a name containing a double quote, backtick or "$" will break
    # (or subvert) the command.  Left as is because a portable list-form
    # replacement depends on the Perl version in use.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    util::rm($tmp_filename);

    # The converter prints the resulting type, or "fail", on stdout.
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'}++;

        if (-s $errlog) {
            # Three-argument open on a lexical handle, with the result
            # checked, so an unreadable log is reported rather than ignored.
            if (open(my $errfh, '<', $errlog)) {
                while (my $line = <$errfh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errfh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the actual output type and return the output filename
    $self->{'convert_to_ext'} = $output_type;

    # \Q...\E: the suffix is literal text, not a pattern -- an extension
    # such as ".c++" or ".(1" must not be interpreted as regex syntax.
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
207
208
# Empty the collection specific tmp directory: the directory
# $GSDLCOLLECTDIR/tmp is removed recursively and then created again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collect_tmp = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    util::rm_r($collect_tmp);
    util::mk_dir($collect_tmp);
}
219
220
221
222
# Override BasPlug read.
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
# $maxdocs is accepted but not used here.
#
# Returns:
#   undef - the file does not match process_exp or is not a plain file,
#           or the plugin specific process() call returned undef
#   0     - the file matched block_exp, could not be converted, or
#           contained no text
#   1     - a document was built and passed to $processor
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = util::filename_cat($base_dir, $file);
    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    if ("$conv_filename" eq "") { return 0; } # allows continue on errors
    if (!-e "$conv_filename")   { return 0; } # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding($conv_filename);
    }
    elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata.  Assign to the outer
        # $language: declaring a new one here would leave the "Language"
        # metadata below undefined.
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding($conv_filename);
        $encoding = $self->{'input_encoding'};
        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }
    }
    else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length($text)) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        # don't leave the converted file behind in the tmp area
        $self->cleanup_tmp_area();
        return 0;
    }

    # create a new document
    my $doc_obj = new doc ($conv_filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();
    $doc_obj->set_OIDtype($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($file));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $top_section, $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    if (!defined($processed)) {
        # don't leave the converted file behind in the tmp area
        $self->cleanup_tmp_area();
        return undef;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'}++;

    return 1;
}
306
307
# Do the plugin specific processing of doc_obj for the converted file.
#
# The converted text is passed to TEXTPlug::process or HTMLPlug::process
# (chosen by $self->{'convert_to'}) using the directory and file name of
# the converted file recorded in $self->{'conv_filename'}.  Afterwards the
# original source file is associated with the document as "doc.$doc_ext"
# and srclink, srcicon and /srclink metadata are added to the top section.
#
# Returns whatever the chosen process() call returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $converted = $self->{'conv_filename'};
    my $conv_dir  = File::Basename::dirname($converted);
    my $conv_tail = File::Basename::basename($converted);

    # pick the parent implementation that matches the conversion target
    my $parent_process = ($self->{'convert_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $parent_process->($self, $textref, $pluginfo,
                                    $conv_dir, $conv_tail,
                                    $metadata, $doc_obj);

    # associate original file with doc object
    my $top = $doc_obj->get_top_section();
    my $source_path = util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_path, "doc.$doc_ext", undef, $top);

    # link and icon metadata pointing at the associated original
    my $open_link = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext>";
    $doc_obj->add_utf8_metadata($top, "srclink", $open_link);
    $doc_obj->add_utf8_metadata($top, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata($top, "/srclink", "</a>");

    return $ret_val;
}
345
346	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: