source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 11008

Last change on this file since 11008 was 11008, checked in by mdewsnip, 18 years ago

Added an option to run the "fribidi" Unicode Bidirectional Algorithm program over the converted file, primarily to fix up right-to-left PDFs that pdftohtml can't handle.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from BasPlug
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug, RTFPlug
# and PDFPlug. It facilitates the conversion of these document types to either
# HTML, Text or auto (allow user to choose which format to convert to).
# It works by dynamically inheriting BasPlug and, based on the plugin type in
# secondary_plugins, deciding which format to 'convert_to'. If the argument is
# not present, the default is to use auto.
33package ConvertToPlug;
34
35use BasPlug;
36use ghtml;
37use HTMLPlug;
38use TEXTPlug;
39use PagedImgPlug;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43
# Set up inheritance at compile time: ConvertToPlug derives from BasPlug.
sub BEGIN {
    @ConvertToPlug::ISA = ('BasPlug');
}
47
# Permitted values for the "convert_to" option. Each 'desc' is a
# brace-delimited description key (presumably resolved to display text
# elsewhere -- not visible in this file).
my $convert_to_list =
    [ { 'name' => "auto",
        'desc' => "{ConvertToPlug.convert_to.auto}" },
      { 'name' => "html",
        'desc' => "{ConvertToPlug.convert_to.html}" },
      { 'name' => "text",
        'desc' => "{ConvertToPlug.convert_to.text}" }
    ];
56
# Argument descriptions for this plugin. new() appends these to the caller's
# "ArgList" before constructing the BasPlug base object.
my $arguments =
    [ { 'name' => "convert_to",
        'desc' => "{ConvertToPlug.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "auto" },
      { 'name' => "title_sub",
        'desc' => "{HTMLPlug.title_sub}",
        'type' => "string",
        #'type' => "regexp",
        'deft' => "" },
      { 'name' => "apply_fribidi",
        'desc' => "{ConvertToPlug.apply_fribidi}",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "use_strings",
        'desc' => "{ConvertToPlug.use_strings}",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "extract_keyphrases",
        'desc' => "{BasPlug.extract_keyphrases}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "extract_keyphrase_options",
        'desc' => "{BasPlug.extract_keyphrase_options}",
        'type' => "string",
        'reqd' => "no",
        'hiddengli' => "yes" } ];
87
# Plugin-level option record. new() appends it to the caller's "OptList".
my $options = { 'name'     => "ConvertToPlug",
                'desc'     => "{ConvertToPlug.desc}",
                'abstract' => "yes",
                'inherits' => "yes",
                'args'     => $arguments };
93
94
# Load and instantiate one secondary plugin for each entry in the
# comma-separated $self->{'convert_to'} value. The plugin class name is the
# entry with "Plug" appended; the resulting objects are stored in
# $self->{'secondary_plugins'}, keyed by plugin class name.
#
# Arguments (after $self):
#   $class           - accepted for interface compatibility; not used here
#   $input_args      - hash ref mapping plugin class name to the argument
#                      list handed to that plugin's constructor
#   $hashArgOptLists - accepted but deliberately not passed on (see below)
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    my %secondary_plugins;

    foreach my $convert_to (split(",", $self->{'convert_to'})) {
        my $plugin_class = $convert_to . "Plug";

        # load in "convert_to" plugin package (require dies if it is missing)
        require "$plugin_class.pm";

        # Call its constructor with the extra options that we've worked out.
        # Secondary plugins don't need to know their calling plugin's
        # options, so $hashArgOptLists is not passed. Arrow syntax is used
        # instead of "new $plugin_class(...)" so the call cannot be misparsed.
        $secondary_plugins{$plugin_class}
            = $plugin_class->new([], $input_args->{$plugin_class});
    }

    $self->{'secondary_plugins'} = \%secondary_plugins;
}
119
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through to the BasPlug constructor
#   $hashArgOptLists - hash ref with "ArgList"/"OptList" entries; this
#                      plugin's arguments and options are appended to it
#
# After the BasPlug base object has been built, the requested 'convert_to'
# type is resolved (taking the concrete plugin class and platform into
# account) and stored as:
#   $self->{'convert_to'}     - "HTML", "TEXT", "StructuredHTML" or "PagedImg"
#   $self->{'convert_to_ext'} - file extension of the converted output
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    # These pushes autovivify $hashArgOptLists when the caller passed none,
    # so it is always defined by the time BasPlug's constructor is called.
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    my $convert_to_type = $self->{'convert_to'};
    if (!defined $convert_to_type || $convert_to_type eq "") {
        $convert_to_type = "auto";
    }

    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    my $have_windows_scripting = defined $self->{'windows_scripting'};

    if ($classPluginName eq "PDFPlug") {
        if ($convert_to_type eq "text" && $on_windows) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $convert_to_type = "html";
        }
    } elsif ($classPluginName eq "WordPlug") {
        if ($have_windows_scripting && $on_windows && $convert_to_type =~ /^(html|auto)$/) {
            # we use structured HTML, not normal html
            $convert_to_type = "structuredhtml";
        }
    } elsif ($classPluginName eq "PPTPlug") {
        if ($have_windows_scripting && $on_windows && $convert_to_type eq "auto") {
            # we use paged img
            $convert_to_type = "pagedimg_jpg";
        }
    } elsif ($classPluginName eq "PSPlug") {
        if ($convert_to_type eq "auto") {
            # we use text
            $convert_to_type = "text";
        }
    }

    if ($convert_to_type eq "auto") {
        # choose html for now - should choose a format based on doc type
        $convert_to_type = "html";
    }

    if ($convert_to_type eq "html") {
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to_type eq "text") {
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    } elsif ($convert_to_type eq "structuredhtml") {
        $self->{'convert_to'} = "StructuredHTML";
        $self->{'convert_to_ext'} = "html";
    } elsif ($convert_to_type =~ /^pagedimg/) {
        $self->{'convert_to'} = "PagedImg";
        # the image format follows "pagedimg_"; fall back to jpg
        my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }

    return bless $self, $class;
}
181
182
# Initialise this plugin via the parent class, then initialise every loaded
# secondary plugin with the same verbosity and output/fail handles.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    # nothing to iterate over if load_secondary_plugins() was never called
    my $secondary_plugins = $self->{'secondary_plugins'} || {};

    foreach my $plugin (values %$secondary_plugins) {
        $plugin->init($verbosity, $outhandle, $failhandle);
    }
}
196
# Called only once, after all plugin passes have been done.
# Calls deinit() on every loaded secondary plugin.
sub deinit {
    my ($self) = @_;

    # nothing to iterate over if load_secondary_plugins() was never called
    my $secondary_plugins = $self->{'secondary_plugins'} || {};

    foreach my $plugin (values %$secondary_plugins) {
        $plugin->deinit();
    }
}
209
# Hook that read() invokes with the converted file's name straight after a
# successful conversion. The default implementation does nothing.
sub convert_post_process
{
    # by default do no post processing
    return;
}
215
216
# Run conversion utility on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments (after $self):
#   $output_ext     - accepted for interface compatibility; the requested
#                     output type is actually taken from
#                     $self->{'convert_to'} and $self->{'convert_to_ext'}
#   $input_filename - file to convert
#   $textref        - accepted for interface compatibility; not used here
#
# Returns the name of the converted file, or "" if the conversion failed.
# On success $self->{'converted_to'} records the type actually produced
# ("HTML", "TEXT" or "PagedImg").
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space, dots and hyphens from the filename to make
    # later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/[\s.\-]+//g;
    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    &util::soft_link($input_filename, $tmp_filename);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    if ($convert_to =~ m/PagedImg/i) {
        $output_type .= "_" . lc($convert_to_ext);
    }

    # NOTE(review): the command goes through the shell with the file names
    # only wrapped in double quotes; names containing '"', '$' or '`' are
    # not escaped.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # No output at all (e.g. the converter could not be run) is treated the
    # same way as an explicit "fail".
    $output_type = "" unless defined $output_type;
    chomp $output_type;
    if ($output_type eq "fail" || $output_type eq "") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # The following meant that if a conversion failed, the document would be counted twice - do we need it for anything?
        #$self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # copy the converter's error log to the output handle
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            } else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    #$self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    } elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImg";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # paged image output lives in a subdirectory named after the file
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        } else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    } else {
        # Swap the original extension for the output type. The suffix is
        # quoted so that its leading "." (and any other regex metacharacter)
        # is matched literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }
    return $output_filename;
}
325
326
# Override BasPlug read
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML or PagedImage.
#
# Return values:
#   the status from read_block() when that is undefined or 0
#   -1  the conversion failed, the converted file is missing, or process()
#       returned undef
#    0  no secondary plugin is available (the file is effectively blocked)
#   the secondary plugin's status when that is undefined or less than 1
#    1  the document was converted and passed to $processor
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));
    $file = $self->read_tidy_file($file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    my $converted_to = $self->{'converted_to'};
    if ($self->{'apply_fribidi'} && defined $converted_to && $converted_to =~ /(HTML|TEXT)/) {
        my $fribidi_output = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirection may have created a partial output file;
            # don't leave it behind in the tmp area
            &util::rm($fribidi_output) if (-e $fribidi_output);
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'} || {};
    my @plugin_names = keys %$secondary_plugins;
    my $num_secondary_plugins = scalar(@plugin_names);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename,
                                                $metadata, $processor, $maxdocs, $total_count,
                                                $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file = &util::filename_within_collection($filename);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file);
    $doc_obj->set_converted_filename($collect_conv_file);

    # last path component of $file (after any "/" or "\")
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    my $top_section = $doc_obj->get_top_section();
    $doc_obj->set_utf8_metadata_element($top_section, "Source", &ghtml::dmsafe($filemeta));
    $doc_obj->set_utf8_metadata_element($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($top_section, "FileSize", (-s $filename));

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # do plugin specific processing of doc_obj
    unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $filemeta);

    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1;
}
427
428
# do plugin specific processing of doc_obj for doc_ext type
#
# Arguments (after $self):
#   $doc_ext  - extension of the original document ("doc", "pdf", ...)
#   $base_dir - joined with $file to locate the original document
#   $file     - document file name
#   $doc_obj  - document object that receives the metadata
#
# Associates the original file with the document as "doc.<ext>", sets the
# "FileFormat" metadata and adds the srclink/srcicon metadata used to link
# back to the original. Always returns 1.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $base_dir, $file, $doc_obj) = @_;

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);

    # file format name for each recognised extension
    my %file_type_for = (
        'doc' => "Word",
        'xls' => "Excel",
        'ppt' => "PPT",
        'pdf' => "PDF",
        'rtf' => "RTF",
        'ps'  => "PS",
    );
    my $file_format = $file_type_for{$doc_ext} || "unknown";

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_format);

    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");

    return 1;
}
467
4681;
469
470
471
472
473
474
475
Note: See TracBrowser for help on using the repository browser.