source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 8915

Last change on this file since 8915 was 8893, checked in by davidb, 19 years ago

Additional check added to the plugin's read function to remain compatible
with BasPlug. The check is connected with supporting the -associate_ext
option implemented in BasPlug.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by plugins such as WordPlug and PDFPlug.
# It facilitates the conversion of these document types to either HTML
# or TEXT by setting up variables that instruct ConvertToBasPlug
# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
BEGIN {
    # Compile-time parent class of this package.  new() constructs the
    # object through either HTMLPlug or TEXTPlug and then re-blesses it
    # into the requesting class, so methods not defined in this package
    # are looked up in HTMLPlug.
    @ISA = ('HTMLPlug');
}
50
# Values accepted by the 'convert_to' argument.  The 'desc' strings are
# resource keys, not literal descriptions.
my $convert_to_list = [
    {
        name => 'html',
        desc => '{ConvertToPlug.convert_to.html}',
    },
    {
        name => 'text',
        desc => '{ConvertToPlug.convert_to.text}',
    },
];

# Arguments this plugin contributes on top of the ones it inherits.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertToPlug.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'html',
    },
    {
        name => 'use_strings',
        desc => '{ConvertToPlug.use_strings}',
        type => 'flag',
        reqd => 'no',
    },
];

# Option record that new() appends to the object's 'option_list'.
my $options = {
    name     => 'ConvertToPlug',
    desc     => '{ConvertToPlug.desc}',
    abstract => 'yes',
    inherits => 'yes',
    args     => $arguments,
};
74
sub parse_args
{
    # Parse the plugin options that ConvertToPlug itself understands.
    #
    # Called as a class method: $class->parse_args(\@option_list).
    #
    #   $args - reference to the option list; it is handed straight to
    #           parsargv::parse (new() notes that parse may modify it).
    #
    # Returns the list ($plugin_name, $newargs):
    #   $plugin_name - the class name with a trailing ".pm" removed
    #   $newargs     - hash ref with the keys 'kea', 'kea_options',
    #                  'generate_format' and 'use_strings'
    #
    # Dies with "\n" when parsargv::parse rejects the options.
    my $class = shift (@_);
    my ($args) = @_;

    my $plugin_name = $class;
    $plugin_name =~ s/\.pm$//;

    my $newargs = {};

    if (!parsargv::parse($args,
                         q^extract_keyphrases^, \$newargs->{'kea'},                          # undocumented
                         q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'},       # undocumented
                         q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
                         q^use_strings^, \$newargs->{'use_strings'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";

        # No object exists yet at this point, only the class name, so the
        # usage text has to be requested through $class.  The call is
        # best-effort: it is wrapped in eval so that a problem while
        # printing the usage cannot replace the intended 'die "\n"' below.
        eval { $class->print_txt_usage(""); };  # Use default resource bundle
        die "\n";
    }

    return ($plugin_name, $newargs);
}
100
sub new {
    # Constructor.  Builds the object through TEXTPlug or HTMLPlug,
    # depending on the parsed 'generate_format' option, records the
    # conversion target on it, and re-blesses it into the requested class.
    #
    # When invoked with "ConvertToPlug" as the class and a defined first
    # argument, that argument is taken as the real class name.
    my $class = shift (@_);
    $class = shift (@_) if ($class eq "ConvertToPlug" && defined $_[0]);

    # parsargv::parse might modify the list, so the parent constructor is
    # given a copy taken before parsing.
    my @parent_args = @_;
    my ($plugin_name, $parsed) = $class->parse_args(\@_);

    my $pdf_text_on_windows = $class eq "PDFPlug"
        && $parsed->{'generate_format'} eq "text"
        && $ENV{'GSDLOS'} =~ /^windows$/i;
    if ($pdf_text_on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $parsed->{'generate_format'} = "html";
    }

    my $self;
    if ($parsed->{'generate_format'} ne "text") {
        $self = HTMLPlug->new($class, @parent_args);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }
    else {
        $self = TEXTPlug->new($class, @parent_args);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    push( @{ $self->{'option_list'} }, $options );

    # Copy every parsed option onto the object.
    @{$self}{ keys %$parsed } = values %$parsed;

    return bless $self, $class;
}
142
143# Go straight to BasPlug and avoid the special case implemented by HTMLPlug
sub metadata_read {
    # Delegate directly to BasPlug's implementation with the caller's
    # argument list unchanged, bypassing the HTMLPlug override that the
    # inheritance chain would otherwise select.
    return &BasPlug::metadata_read(@_);
}
147
148# Run conversion utility on the input file.
149#
150# The conversion takes place in a collection specific 'tmp' directory so
151# that we don't accidentally damage the input.
152#
153# The desired output type is indicated by $output_ext. This is usually
154# something like "html" or "word", but can be "best" (or the empty string)
155# to indicate that the conversion utility should do the best it can.
156
sub tmp_area_convert_file {
    # Convert $input_filename with gsConvert.pl inside the collection's
    # "tmp" directory ($GSDLCOLLECTDIR/tmp).
    #
    # Parameters:
    #   $output_ext     - accepted for interface compatibility; this body
    #                     does not read it (the requested output type comes
    #                     from $self->{'convert_to'})
    #   $input_filename - path of the file to convert
    #   $textref        - accepted but not used here
    #
    # Side effects on $self: sets 'convert_to_ext' to the type reported by
    # the converter and 'converted_to' to "HTML" or "TEXT" when that type
    # looks like html or text; increments 'num_not_processed' on failure.
    #
    # Returns the temporary file name with its suffix replaced by
    # ".<reported type>", or "" when the converter reports "fail".
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Strip white space from the file name (not from the path) so the
    # conversion utilities get a simpler name to work with.
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the paths are only wrapped in double quotes, so a path
    # containing a double quote (or another character the shell interprets
    # inside double quotes) would alter the command line.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    $output_type = `$cmd`;
    # Backticks yield undef when the command could not be run at all.
    $output_type = "" unless defined $output_type;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # Copy the converter's error log to the output handle.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle "$line";
                }
                print $outhandle "\n";
                close $errlog_fh;
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded

    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    my $output_filename = $tmp_filename;

    # \Q...\E: the suffix is literal text, not a pattern.  Unquoted, the
    # leading "." would match any character and a suffix containing regex
    # metacharacters (e.g. ".c++") would fail to match at all.
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
239
240
241# Remove collection specific tmp directory and all its contents.
242
sub cleanup_tmp_area {
    # Empty the collection's "tmp" directory by deleting it recursively
    # and creating it again.
    my ($self) = @_;

    my $scratch_dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($scratch_dir);
    return &util::mk_dir($scratch_dir);
}
251
252
253
254
255# Override BasPlug read
256# We don't want to get language encoding stuff until after we've converted
257# our file to either TEXT or HTML.
sub read {
    # Convert one file and import it as a document.
    #
    # Language/encoding detection is run on the converted file, not on
    # the original.
    #
    # Returns:
    #   0     - file was blocked, conversion produced nothing usable, or
    #           the converted file contained no text
    #   undef - file does not match 'process_exp' or is not a plain file
    #   -1    - the plugin's process() returned undef
    #   1     - document was processed
    #
    # $maxdocs is accepted but not used in this body.
    my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $out = $self->{'outhandle'};

    my $src_path = ($base_dir =~ /\w/)
        ? &util::filename_cat ($base_dir, $file)
        : $file;

    # a form of smart block
    if ($self->associate_with($file, $src_path, $metadata)) {
        $self->{'num_blocked'} ++;
        return 0; # blocked
    }

    if ($self->{'block_exp'} ne "" && $src_path =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }

    return undef unless ($src_path =~ /$self->{'process_exp'}/ && -f $src_path);

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $wanted_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($wanted_ext, $src_path);

    # Both checks allow the import to continue after a conversion error.
    return 0 if ("$conv_filename" eq "" || ! -e "$conv_filename");
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    unless (length ($text)) {
        my $plugin_name = ref ($self);
        if ($self->{'verbosity'}) {
            print $out "$plugin_name: ERROR: $file contains no text\n";
        }
        return 0;
    }

    # if we converted to HTML, convert &eacute; and etc to utf-8.
    # this should really happen before language_extraction, but that means
    # modifying a file on disk...
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # The document is created under the original filename; the converted
    # filename is recorded separately.
    my $doc_obj = doc->new($src_path, "indexed_doc");
    $doc_obj->set_converted_filename($conv_filename);
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    foreach my $pair (["Language", $language], ["Encoding", $encoding]) {
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $pair->[0], $pair->[1]);
    }

    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));

    $self->associate_cover_image($doc_obj, $src_path) if ($self->{'cover_image'});

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $src_path));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    return -1 if (!defined $processed);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
343
344
345# do plugin specific processing of doc_obj for HTML type
sub process_type {
    # Plugin-specific processing of a converted document.
    #
    # Hands the text to TEXTPlug::process or HTMLPlug::process (chosen by
    # $self->{'converted_to'}), using the directory and file name of the
    # converted file as base dir and file.  Then associates the original
    # file with the document as "doc.<ext>" and adds FileFormat, srclink,
    # srcicon and /srclink metadata to the top section.
    #
    #   $doc_ext - extension of the original document (e.g. "doc", "pdf")
    #
    # Returns whatever the delegated process() call returned.
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname = File::Basename::dirname($conv_filename);
    my $tmp_tailname = File::Basename::basename($conv_filename);

    my $ret_val = ($self->{'converted_to'} eq "TEXT")
        ? &TEXTPlug::process($self, $textref, $pluginfo,
                             $tmp_dirname, $tmp_tailname,
                             $metadata, $doc_obj)
        : &HTMLPlug::process($self, $textref, $pluginfo,
                             $tmp_dirname, $tmp_tailname,
                             $metadata, $doc_obj);

    # associate original file with doc object
    my $top = $doc_obj->get_top_section();
    my $original = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($original, "doc.$doc_ext", undef, $top);

    # Display name for each known source extension.
    my %format_for = (
        "doc" => "Word",
        "xls" => "Excel",
        "ppt" => "PPT",
        "pdf" => "PDF",
        "rtf" => "RTF",
        "ps"  => "PS",
    );
    my $file_format = $format_for{$doc_ext} || "unknown";

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($top, "FileFormat", $file_format);

    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata ($top, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($top, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($top, "/srclink", "</a>");

    return $ret_val;
}
404
4051;
Note: See TracBrowser for help on using the repository browser.