source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 4 years ago

Renamed EncodingUtil to CommonUtil and BasePlugin to BaseImporter. The idea is that only top-level plugins (those you can specify in your collection) get to have "Plugin" in their name. All other plugins were modified to reflect these name changes.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1###########################################################################
2#
3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
4# (basic version supports versions 95 and 97)
5# (through OpenOffice extension, supports all contemporary formats)
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package PowerPointPlugin;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
37use AutoLoadConverters;
38use ConvertBinaryFile;
39
# Establish inheritance at compile time: ConvertBinaryFile is searched first,
# then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
43
# Flag flipped to 1 by new() when the auto-loaded converter reports that
# OpenOffice is available; read by get_default_process_exp().
my $openoffice_available = 0;

# convert_to choices offered on Windows.
my $windows_convert_to_list = [
    { name => 'auto',         desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'html',         desc => '{ConvertBinaryFile.convert_to.html}' },
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'pagedimg_jpg', desc => '{PowerPointPlugin.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{PowerPointPlugin.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{PowerPointPlugin.convert_to.pagedimg_png}' },
];

# convert_to choices offered when only OpenOffice conversion is in play.
my $openoffice_convert_to_list = [
    { name => 'auto',       desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'html_multi', desc => '{PowerPointPlugin.convert_to.html_multi}' },
    { name => 'text',       desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'pagedimg',   desc => '{PowerPointPlugin.convert_to.pagedimg}' },
];

# OpenOffice-only choices that new() appends to the Windows list when
# OpenOffice is available on Windows.
my $openoffice_extra_convert_to_list = [
    { name => 'html_multi', desc => '{PowerPointPlugin.convert_to.html_multi}' },
    { name => 'pagedimg',   desc => '{PowerPointPlugin.convert_to.pagedimg}' },
];

# Base argument list; new() appends the platform-specific arguments below.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => '&get_default_process_exp()',    # delayed: resolved inside new()
    },
];

# Extra arguments used on Windows.
my $opt_windows_args = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $windows_convert_to_list,
        deft => 'html',
    },
    {
        name => 'windows_scripting',
        desc => '{PowerPointPlugin.windows_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];

# Extra arguments used off Windows when OpenOffice is available.
my $opt_office_args = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $openoffice_convert_to_list,
        deft => 'html',
    },
];

# Plugin description record pushed onto the "OptList" by new().
my $options = {
    name           => 'PowerPointPlugin',
    desc           => '{PowerPointPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in PPT format can be replaced with GS-generated html
    args           => $arguments,
};
116
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref; its "OptList" and "ArgList" arrays are
#                      extended with this plugin's options and arguments
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, it is returned straight after blessing, without any conversion or
# secondary-plugin setup.
#
# The file-level tables ($arguments, $windows_convert_to_list) are shared by
# every call, so they are extended at most once; otherwise constructing the
# plugin a second time would duplicate the convert_to/windows_scripting
# arguments and the extra OpenOffice convert_to choices.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array has been
    # finished (it is used for parsing the input args), but the OptList
    # entry needs to go in first, to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    # Has a previous call already added the platform-specific arguments?
    my $have_convert_to = grep { $_->{'name'} eq "convert_to" } @$arguments;

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        if ($auto_converter_self->{'openoffice_available'}) {
            # add openoffice convert_to options into list (once only)
            my $have_extras = grep { $_->{'name'} eq "html_multi" } @$windows_convert_to_list;
            push (@$windows_convert_to_list, @$openoffice_extra_convert_to_list) unless $have_extras;
            $openoffice_available = 1;
        }
        push(@$arguments,@$opt_windows_args) unless $have_convert_to;
    }
    elsif ($auto_converter_self->{'openoffice_available'}) {
        push (@$arguments,@$opt_office_args) unless $have_convert_to;
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Work out the default for process_exp. This is delayed until here so
    # that we know whether openoffice is available, but it must happen before
    # the args are parsed. The helper is called directly rather than
    # string-eval'ing the stored default: after the first call the stored
    # value is a regular expression, not Perl code, so eval'ing it again
    # would fail and leave the default undefined.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = get_default_process_exp();
            last;
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    # Resolve "auto" to a concrete output format.
    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # Options handed to whichever secondary plugin processes the converted output.
    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
219
# Returns the default process_exp regular expression (as a string).
# When OpenOffice has been flagged as available the expression also accepts
# .pptx and .odp files; otherwise only .ppt is matched.
sub get_default_process_exp {
    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
229
# Runs init on both parents: first via SUPER (the ConvertBinaryFile side),
# then AutoLoadConverters, forwarding the caller's arguments to each.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
238
# Runs begin on both parents: AutoLoadConverters first, then SUPER,
# forwarding the caller's arguments to each.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
246
# Runs deinit on both parents: AutoLoadConverters first, then SUPER,
# forwarding the caller's arguments to each.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
254
# Overrides the AutoLoadConverters version, because converting to an item
# file needs extra work once the conversion itself has finished.
#
# Arguments: ($output_ext, $input_filename, $textref).
# Without openoffice_conversion the call is handed to ConvertBinaryFile with
# the arguments untouched. With it, the file is converted through
# OpenOfficeConverter; for convert_to "pagedimg" the conversion targets html
# and the name of the generated item file is returned instead of the
# converted file. Returns "" when the converter reports failure.
sub tmp_area_convert_file {
    my $self = shift (@_);

    # Not an OpenOffice conversion: nothing special to do here.
    return $self->ConvertBinaryFile::tmp_area_convert_file(@_)
        unless $self->{'openoffice_conversion'};

    my ($output_ext, $input_filename) = @_;

    # paged images are built from the html output, so convert to html first
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($status, $message, $converted_file)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($status == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $message;
        return "";
    }

    return $converted_file unless $self->{'convert_to'} eq "pagedimg";

    my $item_file = $self->generate_item_file($converted_file);
    return $item_file;
}
285
# Overrides the default read in one situation: converting a presentation to
# html with OpenOffice ("html_multi") produces many html files, and each of
# them is to be processed as its own document.
#
# Arguments: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#             $processor, $maxdocs, $total_count, $gli)
#
# Returns:
#   undef - this plugin cannot process the file
#   -1    - the conversion produced no usable output file
#   $rv   - the (undefined or < 1) status of the secondary HTMLPlugin if one
#           of the generated pages was not processed
#   1     - all generated pages were handed to the processor
# For every other convert_to setting the result of BaseImporter::read is
# returned unchanged.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BaseImporter::read(@_);
    }
    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @dir;
    if (opendir (my $dirhandle, $html_dirname)) {
        @dir = readdir ($dirhandle);
        closedir ($dirhandle);
    }
    else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # just process the original file. fileparse returns the suffix with
        # its leading dot, so no separator is added here.
        @dir = ("$tailname$suffix");
    }

    # Root of the source filename (no directory, no extension); it is the
    # same for every generated page, so work it out once.
    my ($source_root)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            # NOTE(review): this returns without calling
            # clean_up_after_doc_obj_processing() - confirm whether temporary
            # files should be removed on this path too.
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        # set_source_filename does not set the doc_obj source_path which is
        # used in archives dbs for incremental build, so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_root);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    # The source file counts once, however many pages it produced.
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed.
    return 1;
}
399
# Writes a "<tailname>.item" file next to $input_filename, listing every
# imgN.jpg found in that directory together with its textN.html, in numeric
# page order. Each textN.html is passed through tidy_up_html() first.
#
# Argument: $input_filename - a file inside the directory to scan.
# Returns the item file's name, or $input_filename unchanged when the
# directory cannot be read or the item file cannot be opened for writing.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dirhandle;
    if (!opendir ($dirhandle, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dirhandle);
    closedir ($dirhandle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile;
    if (!open ($itemfile, ">", $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so fall back the same way as when the
        # directory is unreadable instead of printing to an unopened handle.
        return $input_filename;
    }
    print $itemfile "<GeneratedBy>PowerPointPlugin\n";

    # one line per slide image, in img1, img2, ... img10 order
    foreach my $file (sort alphanum_sort @dir) {
        if ($file =~ /^img(\d+)\.jpg$/) {
            my $num = $1;
            $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
            print $itemfile "$num:img$num.jpg:text$num.html:\n";
        }
    }
    close $itemfile;
    return $itemfile_name;
}
440
# Sort comparator (works on sort's $a and $b) giving img1, img2, ... img10,
# img11 order: each name is split into a leading non-digit prefix and the
# digits that follow it. Names with the same prefix are ordered numerically
# by those digits; names with different prefixes are compared as plain strings.
sub alphanum_sort {
    my ($left_prefix,  $left_digits)  = ($a =~ /^([^\d]*)(\d*)/);
    my ($right_prefix, $right_digits) = ($b =~ /^([^\d]*)(\d*)/);

    return $left_digits <=> $right_digits if $left_prefix eq $right_prefix;
    return $a cmp $b;
}
450
# Removes the navigation line (links to first page, last page, next page,
# text etc.) from a generated slide html file.
#
# Argument: $filename - the html file to rewrite in place.
# The file is first copied to "$filename.bak" and then rewritten from that
# copy, dropping
#   - the line directly after a <body> line, if it contains <center>, and
#   - every line containing "First page".
# Returns without changing anything if $filename is not a plain file or
# either file cannot be opened. The .bak copy is left in place.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    &File::Copy::copy($filename, $backup_filename);

    # Three-argument open, so the filename is used exactly as given and is
    # never interpreted as part of the open mode.
    open (my $original, "<", $backup_filename) or return;
    my $htmlfile;
    if (!open ($htmlfile, ">", $filename)) {
        close $original;
        return;
    }

    while (defined(my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line straight after <body>
            $line = <$original>;
            last unless defined $line;    # <body> was the last line
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close $htmlfile;
    close $original;
}
4781;
479
Note: See TracBrowser for help on using the repository browser.