source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm@ 22871

Last change on this file since 22871 was 22871, checked in by kjdon, 14 years ago

Added code to generate an item file when pagedimg output is requested with OpenOffice. The generated HTML files have to be tidied up, so unnecessary links are removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
RevLine 
[2981]1###########################################################################
2#
[17722]3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
[22709]4# (basic version supports versions 95 and 97)
[22861]5# (through OpenOffice extension, supports all contemporary formats)
[2981]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
[17722]29package PowerPointPlugin;
[2981]30
[10254]31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
[22515]33no strict 'subs';
[22705]34
[22515]35use gsprintf 'gsprintf';
[2981]36
[22861]37use AutoLoadConverters;
38use ConvertBinaryFile;
[2981]39
# Set up the inheritance chain at compile time. ConvertBinaryFile is listed
# first, so it is the class that SUPER:: calls in this package resolve to;
# AutoLoadConverters is listed second.
BEGIN {
    @PowerPointPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters');
}
[22705]43
# Set to 1 by new() once the OpenOffice converter has been found; read by
# get_default_process_exp() to decide which file extensions to accept.
my $openoffice_available = 0;

# Values offered for -convert_to when running on Windows.
my $windows_convert_to_list = [
    {
        'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}"
    },
    {
        'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}"
    },
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}"
    },
    {
        'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"
    },
    {
        'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}"
    },
    {
        'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}"
    }
];

# Values offered for -convert_to when the OpenOffice converter is available.
my $openoffice_convert_to_list = [
    {
        'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}"
    },
    {
        'name' => "html",
        'desc' => "{PowerPointPlugin.convert_to.oo_html}"
    },
    {
        'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}"
    },
    {
        'name' => "pagedimg",
        'desc' => "{PowerPointPlugin.convert_to.pagedimg}"
    }
];

# Arguments that are always available. new() appends the platform/converter
# specific ones below before the argument list is registered.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "&get_default_process_exp()", # delayed (see below)
    }
];

# Extra arguments added by new() when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $windows_convert_to_list,
        'deft' => "html"
    },
    {
        'name' => "windows_scripting",
        'desc' => "{PowerPointPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no"
    }
];

# Extra arguments added by new() when the OpenOffice converter is available.
my $opt_office_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $openoffice_convert_to_list,
        'deft' => "html"
    }
];

# Plugin description record pushed onto the OptList by new().
my $options = {
    'name'           => "PowerPointPlugin",
    'desc'           => "{PowerPointPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    'srcreplaceable' => "yes", # Source docs in PPT format can be replaced with GS-generated html
    'args'           => $arguments
};
[4744]109
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref holding the "OptList" and "ArgList" arrays
#                      that this plugin's options and arguments are pushed onto
#
# Returns the blessed plugin object. When 'info_only' is set on the merged
# object it is returned straight after blessing, without any conversion setup.
#
# NOTE(review): the file-level $arguments array is extended in place, so
# constructing the plugin more than once in a process appends the optional
# arguments again - confirm callers only construct it once.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The argument array is only registered further down, once it has been
    # completed (it is used for parsing the input args). The options go in
    # first, to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        push(@$arguments, @$opt_windows_args);
    }

    # Arrow syntax rather than "new AutoLoadConverters(...)": inside a package
    # that defines its own new(), the indirect form is ambiguous to the parser.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["OpenOfficeConverter"], 1);

    if ($auto_converter_self->{'openoffice_available'}) {
        push (@$arguments, @$opt_office_args);
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Evaluate the default for process_exp - it needs to be delayed till here
    # so we know if openoffice is available or not, but it must be done before
    # parsing the args.
    foreach my $arg (@$arguments) {
        next unless $arg->{'name'} eq "process_exp";

        my $eval_expr = $arg->{'deft'};
        # Only evaluate while the default is still the delayed "&..." call.
        # On a later construction it already holds the regular expression,
        # and evaluating that as Perl code would replace it with undef.
        if (defined $eval_expr && $eval_expr =~ m/^&/) {
            $arg->{'deft'} = eval "$eval_expr";
        }
        last;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'filename_extension'} = "ppt";
    $self->{'file_type'} = "PPT";

    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time. Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options, "-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # $self was already blessed into $class above, so no second bless is needed.
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
209
# Returns the default process_exp regular expression (as a string).
# When the OpenOffice converter has been detected the pptx and odp
# extensions are accepted as well as ppt; otherwise only ppt is.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
[10275]219
# Runs the parent class init (ConvertBinaryFile, via SUPER) with the
# caller's arguments, then the AutoLoadConverters init with no arguments.
# The value of the AutoLoadConverters call is what gets returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);

    return $self->AutoLoadConverters::init();
}
228
# Runs the AutoLoadConverters begin with no arguments, then the parent
# class begin (via SUPER) with the caller's arguments. The value of the
# SUPER call is what gets returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin();

    return $self->SUPER::begin(@_);
}
236
# Runs the AutoLoadConverters deinit with no arguments, then the parent
# class deinit (via SUPER) with the caller's arguments. The value of the
# SUPER call is what gets returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit();

    return $self->SUPER::deinit(@_);
}
244
# Overrides the AutoLoadConverters version, because more work is needed once
# the file has been converted if we are producing an item file.
#
# Parameters: ($output_ext, $input_filename, $textref)
# Returns the name of the converted file (the generated item file when
# convert_to is "pagedimg"), or "" if the OpenOffice conversion failed.
# When openoffice_conversion is off, the ConvertBinaryFile implementation
# is called with the same arguments and its result is returned.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    unless ($self->{'openoffice_conversion'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # pagedimg output is built from the html conversion, so convert to html first
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($result, $result_str, $new_filename)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($result == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $result_str;
        return "";
    }

    return $new_filename unless $self->{'convert_to'} eq "pagedimg";

    my $item_filename = $self->generate_item_file($new_filename);
    return $item_filename;
}
275
# Override of the default read for some situations: converting ppt to html
# results in many files, and we want them all to be processed.
#
# Returns undef when this plugin cannot process the file; otherwise returns
# the status from read_into_doc_obj (1 means the file has been processed).
sub read_XX {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $was_read = (defined $process_status) && ($process_status == 1);
    if ($was_read) {
        # hand the document over to the processor and count it
        $processor->process($doc_obj);
        $self->{'num_processed'}++;
        undef $doc_obj;
    }

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    return $process_status;
}
304
# Writes a "<name>.item" file next to $input_filename, listing every
# imgN.jpg found in that directory together with its textN.html page, in
# numeric order. Each textN.html is passed through tidy_up_html first.
#
# Parameter: $input_filename - a file produced by the conversion; its
#            directory is scanned and its base name is used for the item file.
# Returns the item file name, or $input_filename unchanged if the directory
# cannot be read or the item file cannot be opened for writing.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the module is loaded before calling it by its full name
    require File::Basename;
    my ($tailname, $dirname)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir($dir_handle);
    closedir($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open($item_handle, '>', $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so fall back to the converted file, as is
        # done above when the directory cannot be read.
        return $input_filename;
    }
    print $item_handle "<GeneratedBy>PowerPointPlugin\n";

    # one line per slide image, in numeric order (img1, img2, ... img10)
    foreach my $file (sort alphanum_sort @dir) {
        next unless $file =~ /^img(\d+)\.jpg$/;
        my $num = $1;
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $item_handle "$num:img$num.jpg:text$num.html:\n";
    }
    close($item_handle);

    return $itemfile_name;
}
345
# Sort comparator (reads the sort variables $a and $b) that orders names
# like img1, img2, ... img10, img11 numerically. Each name is split into a
# leading non-digit part and the digits that follow it: names with different
# leading parts are compared as whole strings, names with the same leading
# part are compared by their number.
sub alphanum_sort {
    my ($left_text, $left_num)   = ($a =~ /^([^\d]*)(\d*)/);
    my ($right_text, $right_num) = ($b =~ /^([^\d]*)(\d*)/);

    return $left_text eq $right_text
        ? ($left_num <=> $right_num)
        : ($a cmp $b);
}
355
# Removes the navigation from a generated HTML page: a line containing
# <center> that directly follows the line containing <body>, and any line
# containing "First page" (the links to first page, last page, next page,
# text etc.).
#
# Parameter: $filename - the HTML file, rewritten in place. A copy of the
#            original is left beside it as "$filename.bak".
# Returns nothing (and leaves the file untouched) if $filename is not a
# plain file, the backup copy cannot be made, or either file cannot be
# opened. Otherwise returns the result of closing the backup file.
sub tidy_up_html {
    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # make sure the module is loaded before calling it by its full name
    require File::Copy;

    # If the copy fails, do not go on: reading from a missing or stale backup
    # would lose or corrupt the page.
    &File::Copy::copy($filename, $backup_filename) or return;

    open(my $original, '<', $backup_filename) or return;
    my $htmlfile;
    if (!open($htmlfile, '>', $filename)) {
        close($original);
        return;
    }

    while (defined(my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line after <body>; it may not exist
            $line = <$original>;
            last unless defined $line;
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close($htmlfile);
    return close($original);
}
[2981]3831;
[10275]384
Note: See TracBrowser for help on using the repository browser.