source: gsdl/trunk/perllib/plugins/ConvertToPlug.pm@ 15151

Last change on this file since 15151 was 15151, checked in by ak19, 16 years ago

Commented out regular expressions that remove periods, hyphens and spaces from filenames to be converted. Need to retain the same output filename as was input

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from BasPlug
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug,
28# RTFPlug and PDFPlug. It facilitates the conversion of these document types
29# to either HTML, TEXT or a series of images. It works by dynamically loading
30# an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug,
31# PagedImgPlug or TEXTPlug) based on the plugin argument 'convert_to'.
32
33package ConvertToPlug;
34
35use BasPlug;
36use ghtml;
37use HTMLPlug;
38use TEXTPlug;
39use PagedImgPlug;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43no strict 'subs';
# Set up inheritance at compile time: ConvertToPlug derives from BasPlug.
BEGIN {
    @ConvertToPlug::ISA = qw(BasPlug);
}
47
# Values accepted by the 'convert_to' argument.  Each 'desc' is a
# brace-wrapped identifier (presumably a key into the Greenstone string
# resources -- confirm against gsprintf).
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => '{ConvertToPlug.convert_to.' . $_ . '}' } }
        qw(auto html text)
];

# Argument descriptions contributed by this plugin.  They are appended to
# the caller's "ArgList" in new().
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertToPlug.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'auto',
    },
    {
        'name' => 'keep_original_filename',
        'desc' => '{ConvertToPlug.keep_original_filename}',
        'type' => 'flag',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlug.title_sub}',
        'type' => 'string',
        #'type' => "regexp",
        'deft' => '',
    },
    {
        'name' => 'apply_fribidi',
        'desc' => '{ConvertToPlug.apply_fribidi}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertToPlug.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'      => 'extract_keyphrases',
        'desc'      => '{BasPlug.extract_keyphrases}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'extract_keyphrase_options',
        'desc'      => '{BasPlug.extract_keyphrase_options}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];

# Plugin-level option record.  It is appended to the caller's "OptList"
# in new().
my $options = {
    'name'     => 'ConvertToPlug',
    'desc'     => '{ConvertToPlug.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
96
97
# Load and instantiate the secondary plugin(s) named by $self->{'convert_to'}.
#
# $self->{'convert_to'} is treated as a comma-separated list; for each entry
# X the package "XPlug" is loaded from <GSDLCOLLECTDIR>/perllib/plugins if a
# file exists there, otherwise from <GSDLHOME>/perllib/plugins.  The class is
# then constructed with an empty plugin list and the argument list found in
# $input_args->{"XPlug"}.
#
# Parameters:
#   $class           - unused here; kept for interface compatibility
#   $input_args      - hash ref mapping plugin class name to its arguments
#   $hashArgOptLists - unused here; kept for interface compatibility
#
# Side effect: stores a hash ref (class name => plugin object) in
# $self->{'secondary_plugins'}.
# Dies if a plugin file cannot be found or if its constructor dies.
sub load_secondary_plugins
{
    my $self = shift (@_);
    my ($class, $input_args, $hashArgOptLists) = @_;

    # An unset 'convert_to' simply yields no secondary plugins.
    my $convert_to_spec = defined $self->{'convert_to'} ? $self->{'convert_to'} : "";
    my $secondary_plugins = {};

    foreach my $convert_to (split(",", $convert_to_spec)) {
        # load in "convert_to" plugin package
        my $plugin_class   = $convert_to . "Plug";
        my $plugin_package = $plugin_class . ".pm";

        my $colplugname = undef;
        if (defined $ENV{'GSDLCOLLECTDIR'}) {
            $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                               "perllib", "plugins",
                                               $plugin_package);
        }

        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'},
                                               "perllib", "plugins",
                                               $plugin_package);

        # A collection-specific plugin takes precedence over the main one.
        if ((defined $colplugname) && (-e $colplugname)) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n",
                      $plugin_class);
            die "\n";
        }

        # Call its constructor with the extra options that we've worked out.
        # A direct class-method call replaces the former string eval with
        # indirect-object syntax: nothing derived from user input is
        # evaluated as code, and the call cannot be mis-parsed.  Any
        # exception raised by the constructor propagates to our caller.
        my $arglist = $input_args->{$plugin_class};
        $secondary_plugins->{$plugin_class} = $plugin_class->new([], $arglist);
    }

    $self->{'secondary_plugins'} = $secondary_plugins;
}
141
# Constructor.
#
# Registers this plugin's argument/option descriptions with the caller's
# lists, builds the object through BasPlug, and then resolves the requested
# 'convert_to' value into the two fields used by the rest of the class:
#   $self->{'convert_to'}     - "HTML", "TEXT", "StructuredHTML" or "PagedImg"
#   $self->{'convert_to_ext'} - "html", "txt", or an image extension
#
# The resolution depends on the name of the first class in $pluginlist
# (PDFPlug, WordPlug, PPTPlug and PSPlug each have special cases), on the
# 'windows_scripting' option and on $ENV{'GSDLOS'}.  A value that matches
# none of the known types leaves both fields as BasPlug set them.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - passed through to BasPlug
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
#
# Returns the object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Direct class-method call rather than indirect-object syntax, which
    # is parse-ambiguous in a package that itself defines new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    my $convert_to_type = $self->{'convert_to'};
    if (!defined $convert_to_type || $convert_to_type eq "") {
        $convert_to_type = "auto";
    }

    my $windows_scripting = $self->{'windows_scripting'};
    $windows_scripting = 0 unless defined $windows_scripting;

    # Evaluate the platform test once; an unset GSDLOS counts as "not
    # Windows" instead of matching against undef.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) ? 1 : 0;

    if ($classPluginName eq "PDFPlug") {
        if ($convert_to_type eq "text" && $on_windows) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $convert_to_type = "html";
        }
    }
    elsif ($classPluginName eq "WordPlug") {
        if ($windows_scripting && $on_windows && $convert_to_type =~ /^(html|auto)$/) {
            # we use structured HTML, not normal html
            $convert_to_type = "structuredhtml";
        }
    }
    elsif ($classPluginName eq "PPTPlug") {
        if ($windows_scripting && $on_windows && $convert_to_type eq "auto") {
            # we use paged img
            $convert_to_type = "pagedimg_jpg";
        }
    }
    elsif ($classPluginName eq "PSPlug") {
        if ($convert_to_type eq "auto") {
            # we use text
            $convert_to_type = "text";
        }
    }

    if ($convert_to_type eq "auto") {
        # choose html for now - should choose a format based on doc type
        $convert_to_type = "html";
    }

    if ($convert_to_type eq "html") {
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($convert_to_type eq "text") {
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    elsif ($convert_to_type eq "structuredhtml") {
        $self->{'convert_to'} = "StructuredHTML";
        $self->{'convert_to_ext'} = "html";
    }
    elsif ($convert_to_type =~ /^pagedimg/) {
        $self->{'convert_to'} = "PagedImg";
        # "pagedimg_<ext>" selects the image type; default to jpg.
        my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
        $self->{'convert_to_ext'} = $convert_to_ext;
    }

    return bless $self, $class;
}
209
210
# Initialise this plugin through the parent class, then forward the same
# verbosity and handles to every secondary plugin.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);

    my $loaded_plugins = $self->{'secondary_plugins'};
    for my $plugin (values %$loaded_plugins) {
        $plugin->init($verbosity, $outhandle, $failhandle);
    }
}
224
# Called only once, after all plugin passes have been done: forwards
# deinit() to every secondary plugin.
# NOTE(review): unlike init(), this does not chain to SUPER::deinit --
# confirm that is intentional.
sub deinit {
    my ($self) = @_;

    my $loaded_plugins = $self->{'secondary_plugins'};
    for my $plugin (values %$loaded_plugins) {
        $plugin->deinit();
    }
}
237
# Hook invoked by read() with the converted file's name once conversion
# has succeeded.  The default implementation does nothing.
sub convert_post_process {
    return;
}
243
244
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a 'tmp' directory under GSDLCOLLECTDIR
# (or GSDLHOME, or failing both the input file's own directory) so that we
# don't accidentally damage the input.  The input is soft-linked into that
# directory, converted, and the link removed again.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; the requested
#                     output type is actually taken from
#                     $self->{'convert_to'} / $self->{'convert_to_ext'}
#   $input_filename - path of the file to convert
#   $textref        - unused
#
# Returns the name of the converted file, or "" if the converter reported
# "fail" (in which case the contents of err.log are copied to the output
# handle).  On success $self->{'converted_to'} records the type the
# converter says it actually produced.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle      = $self->{'outhandle'};
    my $convert_to     = $self->{'convert_to'};
    my $failhandle     = $self->{'failhandle'};
    my $convert_to_ext = $self->{'convert_to_ext'};

    # derive tmp filename from input filename
    # (load File::Basename explicitly rather than relying on another
    # module having pulled it in)
    require File::Basename;
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # softlink to collection tmp dir
    my $tmp_dirname = $dirname;
    if (defined $ENV{'GSDLCOLLECTDIR'}) {
        $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
    }
    elsif (defined $ENV{'GSDLHOME'}) {
        $tmp_dirname = $ENV{'GSDLHOME'};
    }
    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # The tail name is deliberately NOT stripped of spaces, dots or hyphens:
    # doing so is unnecessary and causes problems with
    # replacing_srcdoc_with_html in the GSDLremote case, where the output
    # filename must match the input filename.

    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    $suffix = lc($suffix);
    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    &util::soft_link($input_filename, $tmp_filename);
    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = "";
    if ($convert_to =~ m/PagedImg/i) {
        $output_type = lc($convert_to) . "_" . lc($convert_to_ext);
    }
    else {
        $output_type = lc($convert_to);
    }

    # NOTE(review): the command line is passed through the shell, so a
    # filename containing a double quote, backtick or '$' is not safe here.
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        # $self->{'num_not_processed'} is deliberately not incremented
        # here: doing so meant a failed document was counted twice.
        if (-s "$errlog") {
            # Three-argument open on a lexical handle, with the result
            # checked; a lexical loop variable leaves the caller's $_ alone.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    elsif ($output_type =~ /item/i) {
        $self->{'converted_to'} = "PagedImg";
    }

    my $output_filename = $tmp_filename;
    if ($output_type =~ /item/i) {
        # paged-image output lives in a sub-directory named after the file
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            # running under windows
            $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
        }
        else {
            $output_filename = $tmp_dirname . "\/$tailname\/" . $tailname . ".$output_type";
        }
    }
    else {
        # Swap the original extension for the reported output type.  The
        # suffix is quoted so its leading '.' (and any other regex
        # metacharacter) is matched literally.
        $output_filename =~ s/\Q$suffix\E$/.$output_type/;
    }

    return $output_filename;
}
365
366
# Override BasPlug read.
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML or PagedImage.
#
# Steps: check the file is ours (read_block), convert it in the tmp area,
# optionally run fribidi over the result, hand the converted file to the
# secondary plugin's read_into_doc_obj, then fix up the resulting document
# object (source/converted filenames, Source/Plugin/FileSize metadata,
# cover image, plugin-specific processing, title fallback, OID) and pass
# it to $processor.
#
# Returns:
#   whatever read_block returned, if that is undef or 0;
#   -1 if conversion failed, the converted file does not exist, or
#      process() returned undef;
#   0  if no secondary plugin is loaded;
#   the secondary plugin's return value if that is undef or < 1;
#   1  on success.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));
    $file = $self->read_tidy_file($file);

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    # had an error, will be passed down pipeline
    if (!defined $conv_filename || $conv_filename eq "") { return -1; }
    if (! -e "$conv_filename") { return -1; }
    $self->{'conv_filename'} = $conv_filename;
    $self->convert_post_process($conv_filename);

    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    my $converted_to = defined $self->{'converted_to'} ? $self->{'converted_to'} : "";
    if ($self->{'apply_fribidi'} && $converted_to =~ /HTML|TEXT/) {
        my $fribidi_output  = "${conv_filename}.tmp";
        my $fribidi_command = "fribidi \"$conv_filename\" >\"$fribidi_output\"";
        if (system($fribidi_command) != 0) {
            print STDERR "ERROR: Cannot run fribidi on \"$conv_filename\".\n";
            # the shell redirect may have left a partial file behind
            &util::rm($fribidi_output) if (-e $fribidi_output);
        }
        else {
            &util::mv($fribidi_output, $conv_filename);
        }
    }

    my $secondary_plugins = $self->{'secondary_plugins'};
    # Sorted, so that the plugin chosen below does not depend on hash order.
    my @plugin_names = sort keys %$secondary_plugins;
    my $num_secondary_plugins = scalar(@plugin_names);

    if ($num_secondary_plugins == 0) {
        print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n";
        return 0; # effectively block it
    }

    my $plugin_name = shift @plugin_names;

    if ($num_secondary_plugins > 1) {
        print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name.\n";
    }

    my $secondary_plugin = $secondary_plugins->{$plugin_name};

    # note: metadata is not carried on to the next level
    my ($rv, $doc_obj)
        = $secondary_plugin->read_into_doc_obj ($pluginfo, "", $conv_filename,
                                                $metadata, $processor, $maxdocs, $total_count,
                                                $gli);

    if ((!defined $rv) || ($rv<1)) {
        # wasn't processed
        return $rv;
    }

    # Override previous gsdlsourcefilename set by secondary plugin
    my $collect_file      = &util::filename_within_collection($filename);
    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    $doc_obj->set_source_filename ($collect_file);
    $doc_obj->set_converted_filename($collect_conv_file);

    # last path component of $file, used as the Source metadata
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # do plugin specific processing of doc_obj
    unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # have we found a Title??
    $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $filemeta);

    # add an OID
    # see if there is a plugin-specific set_OID function...
    if (defined ($self->can('set_OID'))) {
        # it will need $doc_obj to set the Identifier metadata...
        $self->set_OID($doc_obj);
    }
    else {
        # use the default set_OID() in doc.pm
        $doc_obj->set_OID();
    }

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1;
}
476
477
# Do plugin specific processing of doc_obj for the doc_ext type.
#
# Associates the original file with the document's top section (as
# "doc.<ext>", or under its own name when keep_original_filename is set),
# records a FileFormat value derived from the extension, and adds the
# srclink / srcicon / /srclink metadata.  Always returns 1.
sub process_type {
    my ($self, $doc_ext, $base_dir, $file, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);

    # associate original file with doc object
    my $assoc_name = "doc.$doc_ext";
    if ($self->{'keep_original_filename'} == 1) {
        # this should be the same filename that was used for the Source
        # metadata, as we will use [Source] in the srclink
        ($assoc_name) = $file =~ /([^\\\/]+)$/;
    }
    $doc_obj->associate_file($source_path, $assoc_name, undef, $top_section);

    # extension -> FileFormat value; anything else is "unknown"
    my %format_for = (
        "doc" => "Word",
        "xls" => "Excel",
        "ppt" => "PPT",
        "pdf" => "PDF",
        "rtf" => "RTF",
        "ps"  => "PS",
    );
    my $file_format = $format_for{$doc_ext} || "unknown";

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($top_section, "FileFormat", $file_format);

    my $doclink = ($self->{'keep_original_filename'} == 1)
        ? "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/[Source]\">"
        : "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";

    $doc_obj->add_utf8_metadata ($top_section, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($top_section, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($top_section, "/srclink", "</a>");

    return 1;
}
523
5241;
525
526
527
528
529
530
531
Note: See TracBrowser for help on using the repository browser.