source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

Last change on this file was 34997, checked in by davidb, 3 years ago

When working with orthogonal indexes, these plugins' constructors get called a second time; however, the way the eval expression was written resulted in an error. The change is to test whether the eval_expr is still in the form '&...' and, if it is, let the eval go ahead. Otherwise (i.e. the second time in the constructor) it has already been evaluated and stored back under the 'deft' name, in which case no further work needs to be done.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.0 KB
Line 
1###########################################################################
2#
3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
4# (basic version supports versions 95 and 97)
5# (through OpenOffice extension, supports all contemporary formats)
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package PowerPointPlugin;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
37use AutoLoadConverters;
38use ConvertBinaryFile;
39
# Set up the inheritance chain at compile time: method lookup tries
# ConvertBinaryFile first, then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
43
# Set to 1 by new() once an OpenOffice converter has been detected; read by
# get_default_process_exp() to decide which file extensions are accepted.
my $openoffice_available = 0;

# convert_to choices offered on Windows.  new() appends the entries of
# $openoffice_extra_convert_to_list to this list when OpenOffice is available.
my $windows_convert_to_list = [
    { 'name' => "auto",
      'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html",
      'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "text",
      'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "pagedimg_jpg",
      'desc' => "{PowerPointPlugin.convert_to.pagedimg_jpg}" },
    { 'name' => "pagedimg_gif",
      'desc' => "{PowerPointPlugin.convert_to.pagedimg_gif}" },
    { 'name' => "pagedimg_png",
      'desc' => "{PowerPointPlugin.convert_to.pagedimg_png}" },
];

# convert_to choices offered when only OpenOffice conversion is available.
my $openoffice_convert_to_list = [
    { 'name' => "auto",
      'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html_multi",
      'desc' => "{PowerPointPlugin.convert_to.html_multi}" },
    { 'name' => "text",
      'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "pagedimg",
      'desc' => "{PowerPointPlugin.convert_to.pagedimg}" },
];

# The OpenOffice-only choices that get added to the Windows list.
my $openoffice_extra_convert_to_list = [
    { 'name' => "html_multi",
      'desc' => "{PowerPointPlugin.convert_to.html_multi}" },
    { 'name' => "pagedimg",
      'desc' => "{PowerPointPlugin.convert_to.pagedimg}" },
];

# Arguments common to every platform.  new() extends this list with one of
# the optional argument sets below before it is pushed onto the ArgList.
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'reqd' => "no",
      # Delayed: new() evals this string once it knows whether OpenOffice
      # is available, and stores the result back into 'deft'.
      'deft' => "&get_default_process_exp()",
    },
];

# Extra arguments added by new() when running on Windows.
my $opt_windows_args = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $windows_convert_to_list,
      'deft' => "html" },
    { 'name' => "windows_scripting",
      'desc' => "{PowerPointPlugin.windows_scripting}",
      'type' => "flag",
      'reqd' => "no" },
];

# Extra arguments added by new() when not on Windows but OpenOffice is
# available.
# NOTE(review): the default "html" is not one of the names in
# $openoffice_convert_to_list -- confirm this is intended.
my $opt_office_args = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $openoffice_convert_to_list,
      'deft' => "html" },
];

# Plugin description record pushed onto the OptList by new().
my $options = {
    'name'           => "PowerPointPlugin",
    'desc'           => "{PowerPointPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    'srcreplaceable' => "yes", # Source docs in PPT format can be replaced with GS-generated html
    'args'           => $arguments,
};
116
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through unchanged to the AutoLoadConverters
#                      and ConvertBinaryFile constructors
#   $hashArgOptLists - hash ref; this plugin's option record is pushed onto
#                      its "OptList" and its arguments onto its "ArgList"
#
# Returns the blessed plugin object.  When 'info_only' is set on the merged
# object, it is returned straight after blessing, without any conversion or
# secondary-plugin set-up.
#
# The argument tables are file-level data shared by every instance, so the
# platform-specific additions are made at most once.  Earlier behaviour
# appended them again on every constructor call, duplicating the convert_to
# and windows_scripting arguments when the constructor ran a second time.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until $arguments is complete (it is used
    # for parsing the input args), so it happens further down.
    # The OptList entry needs to go in first, to get the print info in the
    # right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    # True if $list (array ref of hashes) already has an entry named $name.
    my $has_entry = sub {
        my ($list, $name) = @_;
        return scalar(grep { $_->{'name'} eq $name } @$list);
    };

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        if ($auto_converter_self->{'openoffice_available'}) {
            # add openoffice convert_to options into list (once only)
            if (!$has_entry->($windows_convert_to_list, "html_multi")) {
                push (@$windows_convert_to_list, @$openoffice_extra_convert_to_list);
            }
            $openoffice_available = 1;
        }
        if (!$has_entry->($arguments, "convert_to")) {
            push(@$arguments,@$opt_windows_args);
        }
    }
    elsif ($auto_converter_self->{'openoffice_available'}) {
        if (!$has_entry->($arguments, "convert_to")) {
            push (@$arguments,@$opt_office_args);
        }
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Evaluate the default for process_exp.  It is delayed until here so we
    # know whether openoffice is available, but must be done before the args
    # are parsed.  On a later constructor call 'deft' already holds the
    # evaluated regexp (it no longer starts with '&'), so it is left alone.
    foreach my $arg (@$arguments) {
        next unless $arg->{'name'} eq "process_exp";
        my $eval_expr = $arg->{'deft'};
        if ($eval_expr =~ m/^&/) {
            $arg->{'deft'} = eval "$eval_expr";
        }
        last;
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    # Resolve "auto" to a concrete output format.
    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    $self = bless $self, $class;

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
221
# Returns the default process_exp regexp (as a string).  With OpenOffice
# available the plugin accepts .ppt, .pptx and .odp files; otherwise only
# .ppt.  Matching is case-insensitive in both cases.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
231
# Runs init for both parents: first via SUPER (ConvertBinaryFile comes first
# in @ISA), then the AutoLoadConverters version explicitly.  Remaining
# arguments are forwarded unchanged to both.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
240
# Runs begin for both parents: the AutoLoadConverters version first, then
# the one reached via SUPER.  Remaining arguments are forwarded unchanged.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
248
# Runs deinit for both parents: the AutoLoadConverters version first, then
# the one reached via SUPER.  Remaining arguments are forwarded unchanged.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
256
# Overrides the AutoLoadConverters version, as more work is needed after the
# conversion when the target is an item file.
#
# Parameters: ($output_ext, $input_filename, $textref)
#
# Without 'openoffice_conversion' the call is handed to ConvertBinaryFile
# with the same arguments.  With it, the file is converted through
# OpenOfficeConverter; for convert_to "pagedimg" the conversion goes to html
# first and the generated item file's name is returned.  Returns "" after
# printing the converter's message when the conversion reports failure.
sub tmp_area_convert_file {
    my $self = shift;
    my ($output_ext, $input_filename, $textref) = @_;

    unless ($self->{'openoffice_conversion'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # paged images are built from the html output, so convert to html first
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($status, $message, $converted_filename)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($status == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $message;
        return "";
    }

    return $converted_filename unless $self->{'convert_to'} eq "pagedimg";

    my $item_filename = $self->generate_item_file($converted_filename);
    return $item_filename;
}
287
# Overrides the default read in one situation: converting ppt to html with
# OpenOffice ("html_multi") produces many html files, and every one of them
# should be processed as its own document.
#
# Parameters: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#              $processor, $maxdocs, $total_count, $gli)
#
# Returns:
#   undef  if this plugin cannot process the file
#   -1     if the conversion failed or produced no file
#   $rv    from the secondary plugin when one html file was not processed
#          (undefined or < 1); the remaining files are then skipped
#   1      once all html files have been handed to $processor
# For every other configuration the result of BaseImporter::read is returned.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BaseImporter::read(@_);
    }

    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    # The suffix pattern matches from the final dot, so $suffix includes
    # the dot.
    my ($tailname, $html_dirname, $suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    # Root name of the original source file; the same for every document
    # produced below.
    my ($source_tailname)
        = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");

    my @dir;
    if (opendir (my $dirhandle, $html_dirname)) {
        @dir = readdir ($dirhandle);
        closedir ($dirhandle);
    }
    else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # just process the converted file itself; no extra "." is inserted
        # because $suffix already carries one
        @dir = ("$tailname$suffix");
    }

    foreach my $html_file (@dir) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        # set_source_filename does not set the doc_obj source_path which is
        # used in archives dbs for incremental build, so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        # NOTE(review): the encoding is deduced from the generated html file
        # name, not the original $file -- this matches the existing behaviour
        # (the loop variable used to shadow $file); confirm it is intended.
        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($html_file,$metadata,$plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);

        my $topsection = $doc_obj->get_top_section();
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # The OID generated by HTMLPlugin is kept (no add_OID call here),
        # otherwise all the documents end up with the same id.
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed
    return 1;
}
401
# Sort comparator (reads the sort globals $a and $b) that orders names such
# as img1, img2, ... img10, img11 by number rather than character by
# character.  Each name is split into a leading non-digit part and the run
# of digits straight after it.  If the leading parts differ, the whole names
# are compared as strings; otherwise the digit runs are compared numerically.
sub alphanum_sort {
    my ($left_text,  $left_number)  = ($a =~ /^([^\d]*)(\d*)/);
    my ($right_text, $right_number) = ($b =~ /^([^\d]*)(\d*)/);

    return ($left_text eq $right_text)
        ? ($left_number <=> $right_number)
        : ($a cmp $b);
}
411
# Removes the navigation line that links to first page, last page, next
# page, text etc. from a generated html file.
#
# Parameters:
#   $filename - path of the html file; it is rewritten in place
#
# The file is first copied to "$filename.bak" (which is left behind), then
# rewritten from that copy with two kinds of line dropped:
#   * the line straight after a line containing "<body>", if it contains
#     "<center>"
#   * any line containing "First page"
#
# Returns nothing if $filename is not a plain file, or if the backup copy or
# either open fails; otherwise returns the result of closing the rewritten
# file.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (defined $filename && -f $filename);

    # File::Copy is not loaded at the top of this file, so load it here.
    require File::Copy;

    my $backup_filename = "$filename.bak";

    # Without a fresh backup there is nothing safe to rewrite from (a stale
    # .bak from an earlier run must not be used).
    &File::Copy::copy($filename, $backup_filename) || return;

    # Three-argument open so that unusual characters in the file name cannot
    # be misread as an open mode.
    open (my $original, '<', $backup_filename) || return;
    my $htmlfile;
    if (!open ($htmlfile, '>', $filename)) {
        close ($original);
        return;
    }

    while (defined (my $line = <$original>)) {
        if ($line =~ /<body>/) {
            print $htmlfile $line;
            # look at the line following <body>
            $line = <$original>;
            last unless defined $line;   # <body> was the last line
            next if $line =~ /<center>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close ($original);
    return close ($htmlfile);
}
4391;
440
Note: See TracBrowser for help on using the repository browser.