source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 10405

Last change on this file since 10405 was 10352, checked in by chi, 19 years ago

Change the pagedimg_png,jpg,gif (hyphen to underscore) setting in -convert_to

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from BasPlug
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug, RTFPlug
28# and PDFPlug. It facilitates the conversion of these document types to either
29# HTML, Text or auto (allow user to choose which format to convert to).
30# It works by dynamically inheriting from BasPlug and, based on the plugin type
31# in secondary_plugins, deciding which format to 'convert_to'. If the argument
32# is not present, the default is html ('auto' is treated as html for now).
33package ConvertToPlug;
34
35use BasPlug;
36use ghtml;
37use HTMLPlug;
38use TEXTPlug;
39use PagedImgPlug;
40
41#use strict;
42#no strict 'refs'; # allow filehandles to be variables and viceversa
43
# Set up the static inheritance chain at compile time: ConvertToPlug
# derives from BasPlug.
BEGIN {
    @ConvertToPlug::ISA = qw(BasPlug);
}
47
# Permitted values for the -convert_to option. Each entry pairs the option
# value ('name') with the key of its description string ('desc'), which
# always has the form {ConvertToPlug.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => '{ConvertToPlug.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
62
# Command-line arguments understood by this plugin. Each entry is a hash
# describing one argument: its name, the key of its description string, its
# type and, where given, whether it is required, its default value, its list
# of permitted values and the 'hiddengli' marker.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertToPlug.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlug.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertToPlug.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'      => 'extract_keyphrases',
        'desc'      => '{BasPlug.extract_keyphrases}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'extract_keyphrase_options',
        'desc'      => '{BasPlug.extract_keyphrase_options}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];
89
# Plugin description record: the plugin name, the key of its description
# string, the 'abstract' and 'inherits' markers, and the argument table
# defined in this file.
my $options = {
    'name'     => 'ConvertToPlug',
    'desc'     => '{ConvertToPlug.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
95
# Work out the requested conversion type from a plugin argument list.
#
# $inputargs is a reference to an array of option strings. The element that
# follows the first "-convert_to" is returned when it is "text", "html" or
# contains "pagedimg" (case-insensitively). Every other case -- including
# "auto", an unrecognised value, or no "-convert_to" at all -- yields "html".
sub findType
{
    my ($inputargs) = @_;

    my $last_index = scalar(@{$inputargs}) - 1;

    foreach my $pos (0 .. $last_index) {
        next unless ($inputargs->[$pos] eq "-convert_to");

        # Only the first -convert_to is considered.
        my $requested = $inputargs->[$pos + 1];

        # if the setting is "auto" then refer to html for now
        my $recognised = ($requested =~ /pagedimg.*/i
                          || $requested eq "text"
                          || $requested eq "html");

        return $requested if ($recognised);
        return "html";
    }

    return "html";
}
115
# Instantiate the secondary plugin(s) named in $self->{'convert_to'}.
#
# $self->{'convert_to'} is split on commas. For each entry NAME, the file
# NAMEPlug.pm is loaded and the class NAMEPlug is constructed with an empty
# plugin list and the option list stored in $plugin_options->{NAMEPlug}
# (an empty list when there is no such entry). The new objects are stored,
# keyed by class name, in $self->{'secondary_plugins'}.
#
# $class is accepted as the first argument after $self but is not used here.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class,$plugin_options) = @_;

    my @convert_to_list = split(",",$self->{'convert_to'});

    # Declared lexically so the table does not leak into a package global.
    my $secondary_plugins = {};

    foreach my $convert_to (@convert_to_list) {
        # load in "convert_to" plugin package
        my $plugin_class = $convert_to."Plug";
        my $plugin_package = $plugin_class.".pm";

        require $plugin_package;

        # call its constructor with extra options that we've worked out!
        my $arglist = $plugin_options->{$plugin_class};
        $arglist = [] unless (defined $arglist);

        my $secondary_plugin = $plugin_class->new([], $arglist);
        $secondary_plugins->{$plugin_class} = $secondary_plugin;
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
138
# Constructor.
#
# Arguments: ($pluginlist, $inputargs, $hashArgOptLists).
#   - $class is pushed onto @$pluginlist; the first element of that list
#     names the concrete plugin being built.
#   - $inputargs is scanned by findType() for the -convert_to value.
#   - This module's argument and option tables are appended to
#     $hashArgOptLists->{"ArgList"} and ->{"OptList"}.
#
# The object itself is created by the constructor of the plugin class that
# matches the conversion type (PagedImgPlug, TEXTPlug or HTMLPlug), is given
# 'convert_to' and 'convert_to_ext' fields, and is then re-blessed into
# $class before being returned.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    my $strConvertTo = findType($inputargs);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    if ($classPluginName eq "PDFPlug" && $strConvertTo eq "text" &&
        $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $strConvertTo = "html";
    }

    # Pick the plugin class whose constructor will build the object:
    # any pagedimg_* setting uses PagedImgPlug, otherwise the upper-cased
    # setting gives TEXTPlug or HTMLPlug.
    my $plugin_class;
    if ($strConvertTo =~ /pagedimg.*/i) {
        $plugin_class = "PagedImg"."Plug";
    } else {
        $plugin_class = uc($strConvertTo)."Plug";
    }

    my $plugin_package = $plugin_class.".pm";
    require $plugin_package;

    # call its constructor with extra options that we've worked out!
    my $self = (defined $hashArgOptLists)
        ? $plugin_class->new($pluginlist, $inputargs, $hashArgOptLists)
        : $plugin_class->new($pluginlist, $inputargs);

    # NOTE(review): $secondary_plugin_options below is an undeclared package
    # variable; nothing visible in this file reads it back or stores it in
    # $self. It is left as-is in case code outside this file relies on it.
    if ($strConvertTo eq "text") {
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
        my $text_options = [];
        push(@$text_options,"-metadata_fields","Title,GENERATOR");
        $secondary_plugin_options->{'TextPlug'} = $text_options;
    }
    elsif ($strConvertTo =~ /pagedimg.*/i) {
        $self->{'convert_to'} = "PagedImg";

        # The image type is whatever follows the last underscore of the
        # setting; it is only recorded when it is gif, jpg or png.
        my $convert_to_ext = $strConvertTo;
        $convert_to_ext =~ s/.*\_(.*)/$1/i;
        if ($convert_to_ext eq "gif" || $convert_to_ext eq "jpg" || $convert_to_ext eq "png") {
            $self->{'convert_to_ext'} = $convert_to_ext;
        }

        my $pagedimg_options = [];
        push(@$pagedimg_options,"-metadata_fields","Title,GENERATOR");
        $secondary_plugin_options->{'PagedImgPlug'} = $pagedimg_options;
    }
    else {
        # HTML or auto
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";
        my $html_options = [];
        push(@$html_options,"-rename_assoc_files","1");
        push(@$html_options,"-metadata_fields","Title,GENERATOR");
        $secondary_plugin_options->{'HTMLPlug'} = $html_options;
    }

    return bless $self, $class;
}
210
211
# Initialise this plugin and then every secondary plugin held in
# $self->{'secondary_plugins'}, passing each the same verbosity level and
# output/failure handles.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $secondaries = $self->{'secondary_plugins'};
    for my $secondary (values %$secondaries) {
        $secondary->init($verbosity, $outhandle, $failhandle);
    }
}
225
# Called only once, after all plugin passes have been done: forwards the
# deinit call to every secondary plugin in $self->{'secondary_plugins'}.
sub deinit {
    my $self = shift (@_);

    my $secondaries = $self->{'secondary_plugins'};
    for my $secondary (values %$secondaries) {
        $secondary->deinit();
    }
}
238
# Hook that read() calls with the converted file name after a conversion.
# This default implementation does no post processing and returns nothing.
sub convert_post_process { return; }
244
245
# Run conversion utility on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments: ($output_ext, $input_filename, $textref). Only $input_filename
# is used by this implementation; the output type requested from the
# converter is derived from $self->{'convert_to'} and, for paged images,
# $self->{'convert_to_ext'}.
#
# Returns the name of the converted file, or "" when the converter reports
# "fail". In the failure case the problem is reported on the output and
# failure handles, $self->{'num_not_processed'} is incremented and the
# converter's error log is echoed and removed. On success
# $self->{'converted_to'} records the type that was actually produced.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the file name; it makes later conversion
    # by the utilities simpler. Spaces in the path are left alone.
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type="";
    if ($convert_to =~ m/PagedImg/i) {
        $output_type = lc($convert_to)."_".lc($convert_to_ext);
    } else {
        $output_type = lc($convert_to);
    }

    # NOTE(review): the command is run through the shell with the file names
    # only wrapped in double quotes, so a name containing a double quote,
    # '$' or a backtick would be interpreted by the shell. Left unchanged;
    # confirm that input file names are trusted.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    # The converter prints the type it actually produced (or "fail").
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # Echo the converter's error log, if it can be opened.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $log_line = <$errlog_fh>) {
                    print $outhandle "$log_line";
                }
                close($errlog_fh);
            } else {
                print $outhandle "Could not open $errlog: $!";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    } elsif ($output_type =~ /item/i){
        $self->{'converted_to'} = "PagedImg";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # Paged-image output lives in a sub-directory named after the file.
        # running under windows
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # \Q...\E: the suffix is literal text, not a pattern. Unescaped, its
        # '.' matched any character and a suffix such as ".c++" was an
        # invalid regular expression.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }
    return $output_filename;
}
353
354
# Empty the collection specific tmp directory: the directory named "tmp"
# under $ENV{'GSDLCOLLECTDIR'} is removed recursively and then re-created.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    &util::mk_dir($collection_tmp);
}
364
# Override BasPlug read
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML or PagedImage.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
# $total_count, $gli).
#
# Returns:
#   - the status from read_block() when that is undefined or 0;
#   - 0 when the conversion fails, the converted file does not exist, or
#     there is no secondary plugin;
#   - the secondary plugin's return value when that is undefined or < 1;
#   - -1 when the plugin specific process() call returns undef;
#   - 1 once the document has been handed to $processor.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    #if ($self->is_recursive()) {
    #    die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
    #   }

    my $outhandle = $self->{'outhandle'};

    my ($block_status,$filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));
    $file = $self->read_tidy_file($file);

    my $output_ext = $self->{'convert_to_ext'};

    my $conv_filename = "";
    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    if (! -e "$conv_filename") {return 0;} # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    $self->convert_post_process($conv_filename);

    my $secondary_plugins = $self->{'secondary_plugins'};
    my $num_secondary_plugins = scalar(keys %$secondary_plugins);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
        return 0; # effectively block it
    }

    my @plugin_names = keys %$secondary_plugins;
    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        # The full stop belongs before the newline, not after it.
        print $outhandle "Warning: Multiple secondary plugins not supported yet!  Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    # The converted file name is passed in full, with an empty base directory.
    my ($rv,$doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
                                                $metadata, $processor, $maxdocs, $total_count,
                                                $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file);
    $doc_obj->set_converted_filename($collect_conv_file);

    # "Source" is the last path component of $file.
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));

    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));

    # do plugin specific processing of doc_obj
    unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    ##$self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
449
450
# Plugin specific processing of $doc_obj for a source document whose
# extension is $doc_ext.
#
# The original file ($base_dir joined with $file) is associated with the
# top section of the document as "doc.<ext>", a single "FileFormat"
# metadata value is set from the extension ("unknown" when the extension is
# not recognised), and srclink / srcicon / /srclink metadata are added.
# Always returns 1.
sub process_type {
    my ($self, $doc_ext, $base_dir, $file, $doc_obj) = @_;

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_path, "doc.$doc_ext", undef, $top_section);

    # Extension -> human readable format name.
    my %format_for = (
        "doc" => "Word",
        "xls" => "Excel",
        "ppt" => "PPT",
        "pdf" => "PDF",
        "rtf" => "RTF",
        "ps"  => "PS",
    );
    my $file_format = $format_for{$doc_ext} || "unknown";

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($top_section, "FileFormat", $file_format);

    my $open_link = '<a href="_httpcollection_/index/assoc/[archivedir]/doc.' . $doc_ext . '">';
    my $icon_macro = "_icon" . $doc_ext . "_";

    $doc_obj->add_utf8_metadata($top_section, "srclink", $open_link);
    $doc_obj->add_utf8_metadata($top_section, "srcicon", $icon_macro);
    $doc_obj->add_utf8_metadata($top_section, "/srclink", "</a>");

    return 1;
}
489
4901;
491
492
493
494
495
496
497
Note: See TracBrowser for help on using the repository browser.