source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm@ 22880

Last change on this file since 22880 was 22880, checked in by kjdon, 14 years ago

implemented the read method for when using open office to convert to html multi - the powerpoint gets converted to individual html files, two per slide. one for the image, one for the text. each one gets passed to HTMLPlugin for processing, so all the slides end up as individual documents, but the first page, back, continue etc links work to link them all together

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
RevLine 
[2981]1###########################################################################
2#
[17722]3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
[22709]4# (basic version supports versions 95 and 97)
[22861]5# (through OpenOffice extension, supports all contemporary formats)
[2981]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
[17722]29package PowerPointPlugin;
[2981]30
[10254]31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
[22515]33no strict 'subs';
[22705]34
[22515]35use gsprintf 'gsprintf';
[2981]36
[22861]37use AutoLoadConverters;
38use ConvertBinaryFile;
[2981]39
# Set up inheritance at compile time: ConvertBinaryFile is searched first,
# then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
[22705]43
# Flipped to 1 by new() when the auto-loaded converter reports that
# OpenOffice is available; consulted by get_default_process_exp().
my $openoffice_available = 0;

# Choices for the convert_to option defined in $opt_windows_args.
my $windows_convert_to_list = [
    { 'name' => 'auto',
      'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html',
      'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text',
      'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg_jpg',
      'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { 'name' => 'pagedimg_gif',
      'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { 'name' => 'pagedimg_png',
      'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];

# Choices for the convert_to option defined in $opt_office_args.
my $openoffice_convert_to_list = [
    { 'name' => 'auto',
      'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html_multi',
      'desc' => '{PowerPointPlugin.convert_to.html_multi}' },
    { 'name' => 'text',
      'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg',
      'desc' => '{PowerPointPlugin.convert_to.pagedimg}' },
];
71
# Arguments every PowerPointPlugin instance accepts.  new() appends the
# platform-specific sets below before the arguments are parsed.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        # Placeholder only: new() replaces it with the real default once it
        # knows whether OpenOffice is available.
        'deft' => '&get_default_process_exp()',
    },
];

# Extra arguments added by new() when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $windows_convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'windows_scripting',
        'desc' => '{PowerPointPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra arguments added by new() when OpenOffice conversion is available.
my $opt_office_args = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $openoffice_convert_to_list,
        'deft' => 'html',
    },
];

# Plugin description record pushed onto the OptList by new().
my $options = {
    'name'           => 'PowerPointPlugin',
    'desc'           => '{PowerPointPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes', # Source docs in PPT format can be replaced with GS-generated html
    'args'           => $arguments,
};
[4744]109
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref of plugin names; $class is appended to it
#   $inputargs       - the raw arguments, handed on to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays collect
#                      option and argument descriptions
#
# Returns the blessed plugin object.  When 'info_only' is set the object is
# returned straight after the parent constructors have run, without any of
# the conversion set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array is complete
    # (it is used for parsing the input args), but the options record goes in
    # first to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # $arguments is shared by every instance created in this process, so only
    # append entries that are not already in it.  Pushing unconditionally
    # would duplicate the platform arguments each time new() is called.
    my $add_args_once = sub {
        my ($extra_args) = @_;
        foreach my $extra (@$extra_args) {
            push(@$arguments, $extra) unless grep { $_ == $extra } @$arguments;
        }
    };

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $add_args_once->($opt_windows_args);
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $add_args_once->($opt_office_args);
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Resolve the default for process_exp.  It is delayed until here so we
    # know whether openoffice is available, but must be done before the args
    # are parsed.  Only the placeholder is replaced: on a second call the
    # default is already a regexp string and must be left alone.
    foreach my $arg (@$arguments) {
        next unless $arg->{'name'} eq "process_exp";
        if (defined $arg->{'deft'} && $arg->{'deft'} eq "&get_default_process_exp()") {
            $arg->{'deft'} = get_default_process_exp();
        }
        last;
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # $self is already blessed into $class above.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
208
# Returns the default process_exp regexp as a string: .ppt, .pptx and .odp
# when OpenOffice conversion is available, otherwise .ppt only.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
[10275]218
# Runs the inherited init (ConvertBinaryFile) with the caller's arguments,
# then AutoLoadConverters::init, whose result is returned.
sub init {
    my $self = shift;

    # ConvertBinaryFile init
    $self->SUPER::init(@_);

    return $self->AutoLoadConverters::init();
}
227
# Runs AutoLoadConverters::begin first, then the inherited begin with the
# caller's arguments; the inherited call's result is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin();

    return $self->SUPER::begin(@_);
}
235
# Runs AutoLoadConverters::deinit first, then the inherited deinit with the
# caller's arguments; the inherited call's result is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit();

    return $self->SUPER::deinit(@_);
}
243
# Override the AutoLoadConverters version, as we need to do more work after
# the conversion when we are converting to an item file.
#
# Parameters: ($output_ext, $input_filename, $textref); $textref is only
# passed through to ConvertBinaryFile.
# Returns the converted filename (the item file for "pagedimg"), or "" when
# the OpenOffice conversion reports failure.
sub tmp_area_convert_file {
    my $self = shift;
    my ($output_ext, $input_filename) = @_;

    # Without OpenOffice conversion, ConvertBinaryFile does all the work.
    if (!$self->{'openoffice_conversion'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # pagedimg is built from the html output, so first convert to html
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($result, $result_str, $new_filename)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($result == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $result_str;
        return "";
    }

    if ($self->{'convert_to'} eq "pagedimg") {
        my $item_filename = $self->generate_item_file($new_filename);
        return $item_filename;
    }

    return $new_filename;
}
274
# Override the default read in some situations: converting a ppt to html
# produces many files, and we want them all to be processed.
#
# Only the OpenOffice "html_multi" conversion is handled here; everything
# else is delegated to BasePlugin::read.  For html_multi, every .html file in
# the conversion output directory is read by the secondary HTMLPlugin, given
# the source file's metadata, and passed to $processor as its own document.
#
# Returns:
#   undef - this plugin cannot process the file
#   -1    - the conversion failed or produced no output file
#   $rv   - the secondary plugin's status when it is undefined or below 1
#   1     - the file was processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BasePlugin::read(@_);
    }

    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    # had an error, will be passed down pipeline
    if (!defined $conv_filename || $conv_filename eq "") {return -1;}
    if (! -e $conv_filename) {return -1;}

    # fileparse returns the suffix with its leading dot (e.g. ".html")
    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @html_files;
    if (opendir (my $dh, $html_dirname)) {
        # readdir order is unspecified; sort so the slides are always
        # processed in slide order (text1, text2, ... text10).
        my @entries = readdir ($dh);
        closedir ($dh);
        @html_files = sort alphanum_sort @entries;
    } else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # just process the original file ($suffix already carries the dot)
        @html_files = ("$tailname$suffix");
    }

    # These depend only on the source file, so work them out once.
    my ($source_root) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    my $source_size = -s $filename_full_path;

    foreach my $html_file (@html_files) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed; still delete any temp files we created
            $self->clean_up_after_doc_obj_processing();
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->{'source_path'} = $filename_full_path;
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        $self->set_Source_metadata($doc_obj, $filename_no_path);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", $source_size);
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_root);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed
    return 1;
}
386
# Writes "<name>.item" next to the converted html file, listing every slide
# image found in that directory together with its text page.
#
# Parameter: $input_filename - the html file produced by the conversion.
# Returns the item file's name, or $input_filename unchanged when the
# directory cannot be read or the item file cannot be created.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dh;
    if (!opendir ($dh, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dh);
    closedir ($dh);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile;
    if (!open ($itemfile, ">", $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so fall back exactly as for an unreadable
        # directory rather than printing to an unopened handle.
        return $input_filename;
    }
    print $itemfile "<GeneratedBy>PowerPointPlugin\n";

    # one line per slide image, in slide order (img1, img2, ... img10)
    foreach my $file (sort alphanum_sort @dir) {
        next unless $file =~ /^img(\d+)\.jpg$/;
        my $num = $1;
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $itemfile "$num:img$num.jpg:text$num.html:\n";
    }
    close ($itemfile);

    return $itemfile_name;
}
427
# Sort comparator (reads the package variables $a and $b): orders
# img1, img2, ... img10, img11 by number rather than character by character.
#
# Each name is split into a leading non-digit prefix and the digits that
# follow it.  Names with different prefixes compare as plain strings; names
# with the same prefix compare by number, and then as plain strings so that
# the ordering is fully determined.
sub alphanum_sort {

    my ($a_txt, $a_num) = $a =~ /^([^\d]*)(\d*)/;
    my ($b_txt, $b_num) = $b =~ /^([^\d]*)(\d*)/;

    if ($a_txt ne $b_txt) { return ($a cmp $b) };

    # a name with no digits after its prefix counts as number 0
    $a_num = 0 unless length($a_num);
    $b_num = 0 unless length($b_num);

    return ($a_num <=> $b_num) || ($a cmp $b);
}
437
# Want to remove the line that links to first page, last page, next page,
# text etc.
#
# Copies $filename to "$filename.bak" and rewrites $filename from that copy,
# dropping the line straight after <body> when it contains <center>, and any
# line containing "First page".  The .bak copy is left in place.
#
# Returns 1 after rewriting the file; returns nothing when $filename is not
# a plain file or when the copy or either open fails.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # Without a fresh backup, leave the file alone: a stale .bak from an
    # earlier run must never be written back over it.
    &File::Copy::copy($filename, $backup_filename) or return;

    open (my $original, "<", $backup_filename) or return;
    my $htmlfile;
    if (!open ($htmlfile, ">", $filename)) {
        close ($original);
        return;
    }

    while (defined (my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line after <body>; it may not exist
            $line = <$original>;
            last unless defined $line;
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close ($htmlfile);
    close ($original);
    return 1;
}
[2981]4651;
[10275]466
Note: See TracBrowser for help on using the repository browser.