source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm@ 31742

Last change on this file since 31742 was 31742, checked in by ak19, 7 years ago

No need to hardcode the plugin name

  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
RevLine 
[2981]1###########################################################################
2#
[17722]3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
[22709]4# (basic version supports versions 95 and 97)
[22861]5# (through OpenOffice extension, supports all contemporary formats)
[2981]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
[17722]29package PowerPointPlugin;
[2981]30
[10254]31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
[22515]33no strict 'subs';
[22705]34
[22515]35use gsprintf 'gsprintf';
[2981]36
[22861]37use AutoLoadConverters;
38use ConvertBinaryFile;
[2981]39
# Set up inheritance at compile time: ConvertBinaryFile is searched first,
# then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
[22705]43
# Set to 1 by new() when the AutoLoadConverters object reports that OpenOffice
# is available; read by get_default_process_exp().
my $openoffice_available = 0;

# convert_to choices offered on Windows.  new() appends
# $openoffice_extra_convert_to_list to this list when OpenOffice is available.
my $windows_convert_to_list = [
    {
        'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}",
    },
    {
        'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}",
    },
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}",
    },
    {
        'name' => "pagedimg_jpg",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg_jpg}",
    },
    {
        'name' => "pagedimg_gif",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg_gif}",
    },
    {
        'name' => "pagedimg_png",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg_png}",
    },
];

# convert_to choices offered when only OpenOffice conversion is in play.
my $openoffice_convert_to_list = [
    {
        'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}",
    },
    {
        'name' => "html_multi",
        'desc' => "{PowerPointPlugin.convert_to.html_multi}",
    },
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}",
    },
    {
        'name' => "pagedimg",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg}",
    },
];

# OpenOffice-only choices that get added to the Windows list.
my $openoffice_extra_convert_to_list = [
    {
        'name' => "html_multi",
        'desc' => "{PowerPointPlugin.convert_to.html_multi}",
    },
    {
        'name' => "pagedimg",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg}",
    },
];

# Arguments common to every platform.  new() extends this list with one of the
# $opt_*_args lists below before handing it to the argument parser.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        # Held as code text and evaluated in new(), because the default
        # depends on whether OpenOffice turned out to be available.
        'deft' => "&get_default_process_exp()",
    },
];

# Extra arguments used on Windows.
my $opt_windows_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $windows_convert_to_list,
        'deft' => "html",
    },
    {
        'name' => "windows_scripting",
        'desc' => "{PowerPointPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Extra arguments used on other platforms when OpenOffice is available.
# NOTE(review): the default "html" is not one of the names listed in
# $openoffice_convert_to_list -- confirm that this is intended.
my $opt_office_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $openoffice_convert_to_list,
        'deft' => "html",
    },
];

# Plugin description record pushed onto the caller's "OptList" by new().
my $options = {
    'name'           => "PowerPointPlugin",
    'desc'           => "{PowerPointPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PPT format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
[4744]116
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays receive
#                      this plugin's option record and argument descriptions
#
# Returns the blessed plugin object.  When 'info_only' is set on the merged
# object it is returned straight after blessing, without any conversion
# settings or secondary plugins being set up.
#
# The file-level argument tables are shared by every call, so all the updates
# made to them here are written to be safe when the constructor runs more
# than once in the same process.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array is finished (it
    # is used for parsing the input args), but the option record needs to go
    # in first, to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method-call syntax: the indirect form "new Class(...)" is
    # ambiguous inside a package that defines its own new().
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    # True if an earlier call already added a convert_to argument.
    my $have_convert_to = grep { $_->{'name'} eq "convert_to" } @$arguments;

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        if ($auto_converter_self->{'openoffice_available'}) {
            # add openoffice convert_to options into list (once only)
            if (!grep { $_->{'name'} eq "html_multi" } @$windows_convert_to_list) {
                push (@$windows_convert_to_list, @$openoffice_extra_convert_to_list);
            }
            $openoffice_available = 1;
        }
        push(@$arguments,@$opt_windows_args) unless $have_convert_to;
    }
    elsif ($auto_converter_self->{'openoffice_available'}) {
        push (@$arguments,@$opt_office_args) unless $have_convert_to;
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Evaluate the default for process_exp.  It has to be delayed till here so
    # we know whether openoffice is available, but must happen before the args
    # are parsed.  Only the unevaluated "&..." form is evaluated: once the
    # default has been replaced by the regexp text, evaluating that text as
    # Perl code would fail and leave the default undefined.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            my $eval_expr = $arg->{'deft'};
            if (defined $eval_expr && $eval_expr =~ m/^&/) {
                $arg->{'deft'} = eval "$eval_expr";
            }
            last;
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    # Resolve "auto" to a concrete target format.
    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # Options handed to whichever secondary plugin processes the converted file.
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # Second bless kept from the original code; it repeats the one above.
    $self = bless $self, $class;

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
219
# Returns the default process_exp regexp text: .ppt only, or .ppt/.pptx/.odp
# once OpenOffice has been flagged as available.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
[10275]229
# Runs the inherited init (reached through SUPER) and then the
# AutoLoadConverters init, both with the caller's arguments.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
238
# Runs the AutoLoadConverters begin and then the inherited begin (reached
# through SUPER), both with the caller's arguments.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
246
# Runs the AutoLoadConverters deinit and then the inherited deinit (reached
# through SUPER), both with the caller's arguments.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
254
# Overrides the AutoLoadConverters version, because more work is needed once
# the file is converted if we are producing an item file.
#
# Parameters: ($output_ext, $input_filename, $textref).
# Returns the name of the converted file (the generated item file when
# convert_to is "pagedimg"), or "" when the OpenOffice conversion fails.
# Without openoffice_conversion the call is passed on to ConvertBinaryFile.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    # Not an OpenOffice job: use the ConvertBinaryFile implementation as is.
    return $self->ConvertBinaryFile::tmp_area_convert_file(@_)
        unless $self->{'openoffice_conversion'};

    # For paged images, first convert to html
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($result, $result_str, $new_filename) = $self->OpenOfficeConverter::convert($input_filename, $output_ext);
    if ($result == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $result_str;
        return "";
    }

    # Plain conversions hand back the converted file directly.
    return $new_filename unless $self->{'convert_to'} eq "pagedimg";

    # Paged images: build the item file from the converter's output.
    my $item_filename = $self->generate_item_file($new_filename);
    return $item_filename;
}
285
# Overrides the default read in one situation: with OpenOffice conversion to
# "html_multi", converting a presentation results in many html files and we
# want them all to be processed.  Every other configuration is handed straight
# to BaseImporter::read.
#
# Parameters: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#              $processor, $maxdocs, $total_count, $gli)
#
# Returns:
#   undef - this plugin cannot process the file
#   -1    - conversion failed or the converted file does not exist
#   1     - the generated html pages were processed
#   or the secondary plugin's own status, if it reports a page as not
#   processed (undefined or < 1).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BaseImporter::read(@_);
    }
    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    # Make sure fileparse is loaded rather than relying on another module
    # having pulled File::Basename in.
    require File::Basename;
    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    # Root name of the source file; it is the same for every generated page.
    my ($source_root)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    if (opendir (my $dirhandle, $html_dirname)) {
        @dir = readdir ($dirhandle);
        closedir ($dirhandle);
    }
    else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # Just process the converted file itself.  The suffix returned by
        # fileparse already includes the leading dot, so no extra "." here.
        @dir = ("$tailname$suffix");
    }

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        # NOTE(review): the generated html file name, not the original source
        # file name, is what gets passed here.  Behaviour is kept as it was;
        # confirm which name is intended.
        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_root);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed
    return 1;
}
399
# Writes "<name>.item" next to $input_filename, listing the slide images
# found in that directory together with their text pages.
#
# For every directory entry named img<N>.jpg (taken in alphanum_sort order),
# text<N>.html is passed through tidy_up_html and the line
# "<N>:img<N>.jpg:text<N>.html:" is added to the item file.
#
# Parameter: $input_filename - a file inside the directory to scan; its name
#            without the suffix becomes the item file's name.
# Returns the item file name, or $input_filename unchanged when the directory
# cannot be read or the item file cannot be opened for writing.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # Make sure fileparse is loaded rather than relying on another module
    # having pulled File::Basename in.
    require File::Basename;
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    my $plugin_name = $self->{'plugin_type'};

    # find all the files in the directory
    my $dirhandle;
    if (!opendir ($dirhandle, $dirname)) {
        print $outhandle "$plugin_name: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dirhandle);
    closedir ($dirhandle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile;
    if (!open ($itemfile, ">", $itemfile_name)) {
        print $outhandle "$plugin_name: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so fall back the same way as for an
        # unreadable directory instead of returning the name of a file that
        # was never created.
        return $input_filename;
    }
    print $itemfile "<GeneratedBy>$plugin_name\n";

    # one entry per slide image, in numeric page order
    foreach my $file (sort alphanum_sort @dir) {
        if ($file =~ /^img(\d+)\.jpg$/) {
            my $num = $1;
            $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
            print $itemfile "$num:img$num.jpg:text$num.html:\n";
        }
    }
    close ($itemfile);
    return $itemfile_name;
}
442
# Sort comparator (reads sort's $a and $b) that orders names such as
# img1, img2, ... img10, img11 by their number.  Each name is split into a
# leading run of non-digits and the digits that follow it: names with
# different leading text are compared as whole strings, names with the same
# leading text are compared numerically on the digits.
sub alphanum_sort {
    my @left  = ($a =~ /^([^\d]*)(\d*)/);
    my @right = ($b =~ /^([^\d]*)(\d*)/);

    return $left[0] eq $right[0]
        ? ($left[1] <=> $right[1])
        : ($a cmp $b);
}
452
# Removes the line that links to first page, last page, next page, text etc.
# from a generated html page, rewriting the file in place.
#
# The file is first copied to "<filename>.bak" (the copy is left on disk) and
# then rewritten from that copy with these lines dropped:
#   - the line directly after a line containing "<body>", if it contains
#     "<center>";
#   - any line containing "First page".
#
# Parameter: $filename - path of the html file to tidy.
# Returns nothing when $filename is not a plain file, when the backup copy
# cannot be made, or when either file cannot be opened; otherwise returns the
# result of closing the backup handle.
sub tidy_up_html {
    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # Make sure copy is loaded rather than relying on another module having
    # pulled File::Copy in.
    require File::Copy;

    # Without a fresh backup there is nothing safe to rewrite from (a stale
    # .bak from an earlier run would otherwise overwrite the page).
    &File::Copy::copy($filename, $backup_filename) or return;

    open (my $original, "<", $backup_filename) or return;
    my $htmlfile;
    if (!open ($htmlfile, ">", $filename)) {
        close ($original);
        return;
    }

    while (defined (my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line straight after <body>
            $line = <$original>;
            last unless defined $line;    # <body> was the last line
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close ($htmlfile);
    return close ($original);
}
[2981]4801;
[10275]481
Note: See TracBrowser for help on using the repository browser.