source: main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm@ 23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago

Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1###########################################################################
2#
3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
4# (basic version supports versions 95 and 97)
5# (through OpenOffice extension, supports all contemporary formats)
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package PowerPointPlugin;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
37use AutoLoadConverters;
38use ConvertBinaryFile;
39
# Set up inheritance at compile time: method lookup tries ConvertBinaryFile
# first and then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
43
# Set to 1 by new() when the OpenOffice converter is available on a
# non-Windows system; read by get_default_process_exp().
my $openoffice_available = 0;

# convert_to choices offered on Windows. new() appends the entries of
# $openoffice_extra_convert_to_list when OpenOffice is also available.
my $windows_convert_to_list = [
    { name => "auto",
      desc => "{ConvertBinaryFile.convert_to.auto}" },
    { name => "html",
      desc => "{ConvertBinaryFile.convert_to.html}" },
    { name => "text",
      desc => "{ConvertBinaryFile.convert_to.text}" },
    { name => "pagedimg_jpg",
      desc => "{PowerPointPlugin.convert_to.pagedimg_jpg}" },
    { name => "pagedimg_gif",
      desc => "{PowerPointPlugin.convert_to.pagedimg_gif}" },
    { name => "pagedimg_png",
      desc => "{PowerPointPlugin.convert_to.pagedimg_png}" },
];

# convert_to choices offered when only OpenOffice conversion is available.
my $openoffice_convert_to_list = [
    { name => "auto",
      desc => "{ConvertBinaryFile.convert_to.auto}" },
    { name => "html_multi",
      desc => "{PowerPointPlugin.convert_to.html_multi}" },
    { name => "text",
      desc => "{ConvertBinaryFile.convert_to.text}" },
    { name => "pagedimg",
      desc => "{PowerPointPlugin.convert_to.pagedimg}" },
];

# OpenOffice-only choices that get added to the Windows list.
my $openoffice_extra_convert_to_list = [
    { name => "html_multi",
      desc => "{PowerPointPlugin.convert_to.html_multi}" },
    { name => "pagedimg",
      desc => "{PowerPointPlugin.convert_to.pagedimg}" },
];

# Arguments common to every platform. new() extends this list with one of
# the $opt_*_args lists below before it is handed to the argument parser.
my $arguments = [
    { name => "process_exp",
      desc => "{BasePlugin.process_exp}",
      type => "regexp",
      reqd => "no",
      deft => "&get_default_process_exp()",   # delayed (see below)
    },
];

# Extra arguments added when running on Windows.
my $opt_windows_args = [
    { name => "convert_to",
      desc => "{ConvertBinaryFile.convert_to}",
      type => "enum",
      reqd => "yes",
      list => $windows_convert_to_list,
      deft => "html" },
    { name => "windows_scripting",
      desc => "{PowerPointPlugin.windows_scripting}",
      type => "flag",
      reqd => "no" },
];

# Extra arguments added when OpenOffice is available on a non-Windows system.
my $opt_office_args = [
    { name => "convert_to",
      desc => "{ConvertBinaryFile.convert_to}",
      type => "enum",
      reqd => "yes",
      list => $openoffice_convert_to_list,
      deft => "html" },
];

# Plugin description record pushed onto the caller's OptList by new().
my $options = {
    name           => "PowerPointPlugin",
    desc           => "{PowerPointPlugin.desc}",
    abstract       => "no",
    inherits       => "yes",
    srcreplaceable => "yes",   # Source docs in PPT format can be replaced with GS-generated html
    args           => $arguments,
};
116
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - passed through to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays are
#                      extended with this plugin's options and arguments
#
# Builds the platform-specific argument list, constructs the two parent
# objects (AutoLoadConverters and ConvertBinaryFile), merges them, resolves
# the convert_to setting and finally loads the secondary plugin.
# Returns the blessed plugin object.
#
# The argument tables are file-level data shared by every call, so each
# addition is guarded to keep repeated construction from duplicating entries.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array is complete
    # (see below); the OptList entry goes in first so the printed info
    # comes out in the right order.
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Class->new form: the indirect-object form "new Class(...)" is ambiguous
    # inside a package that defines its own new().
    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["OpenOfficeConverter"], 1);

    # Has an earlier call already added the platform-specific arguments?
    my $has_convert_to = grep { $_->{'name'} eq "convert_to" } @$arguments;

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        if ($auto_converter_self->{'openoffice_available'}) {
            # add openoffice convert_to options into list (once only)
            my %already_listed = map { $_->{'name'} => 1 } @$windows_convert_to_list;
            push(@$windows_convert_to_list,
                 grep { !$already_listed{$_->{'name'}} } @$openoffice_extra_convert_to_list);
        }
        push(@$arguments, @$opt_windows_args) unless $has_convert_to;
    }
    elsif ($auto_converter_self->{'openoffice_available'}) {
        push(@$arguments, @$opt_office_args) unless $has_convert_to;
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Evaluate the default for process_exp - it needs to be delayed till here
    # so we know if openoffice is available or not. But needs to be done
    # before parsing the args.
    foreach my $arg (@$arguments) {
        next unless $arg->{'name'} eq "process_exp";

        my $delayed = $arg->{'deft'};
        # Only resolve while the default is still the delayed "&func()" form;
        # re-evaluating an already resolved regular expression would fail
        # and wipe out the default.
        if (defined $delayed && $delayed =~ m/^&/) {
            my $resolved = eval "$delayed";
            $arg->{'deft'} = $resolved if defined $resolved;
        }
        last;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    # "auto" becomes paged jpg images under windows scripting, html otherwise
    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options, "-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # $self is already blessed into $class at this point
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
218
# Returns the default process_exp regular expression (as a string).
# When OpenOffice is available the pptx and odp extensions are accepted
# as well as ppt.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
228
# Runs the inherited init() (found through SUPER, with the caller's
# arguments) and then the AutoLoadConverters init().
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->AutoLoadConverters::init();
}
237
# Runs the AutoLoadConverters begin() and then the inherited begin()
# (found through SUPER, with the caller's arguments).
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin();
    $self->SUPER::begin(@begin_args);
}
245
# Runs the AutoLoadConverters deinit() and then the inherited deinit()
# (found through SUPER, with the caller's arguments).
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit();
    $self->SUPER::deinit(@deinit_args);
}
253
# Overrides the AutoLoadConverters version, because converting to an item
# file needs extra work once the OpenOffice conversion has finished.
#
# Arguments: $output_ext, $input_filename, $textref ($textref is only passed
# on to ConvertBinaryFile).
# Returns the name of the converted file (or of the generated item file for
# "pagedimg"), or "" when the OpenOffice conversion reports an error.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    # No OpenOffice conversion requested: ConvertBinaryFile does the work.
    return $self->ConvertBinaryFile::tmp_area_convert_file(@_)
        unless $self->{'openoffice_conversion'};

    # paged images are produced by converting to html first
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($result, $result_str, $new_filename)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($result == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $result_str;
        return "";
    }
    #print STDERR "result = $result\n";

    return $new_filename unless $self->{'convert_to'} eq "pagedimg";

    my $item_filename = $self->generate_item_file($new_filename);
    return $item_filename;
}
284
# Overrides the default read in one situation: converting ppt to html with
# OpenOffice ("html_multi") results in many html files, and we want them all
# to be processed. Every other configuration is handed to BasePlugin::read.
#
# Arguments: $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#            $maxdocs, $total_count, $gli
# Returns:
#   undef  if this plugin cannot process the file
#   -1     if the conversion failed or produced no file
#   the secondary plugin's status if one of the html files was not processed
#   1      once all generated html files have been processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BasePlugin::read(@_);
    }

    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    # fileparse returns the matched suffix including its leading dot
    my ($conv_tailname, $html_dirname, $conv_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @html_candidates;
    if (opendir(my $dir_handle, $html_dirname)) {
        @html_candidates = readdir($dir_handle);
        closedir($dir_handle);
    }
    else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # just process the converted file itself; the suffix already carries
        # its dot, so no extra "." is inserted
        @html_candidates = ("$conv_tailname$conv_suffix");
    }

    # The loop variable has its own name so that $file keeps referring to
    # the original source document inside the loop.
    foreach my $html_file (@html_candidates) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo, "", &util::filename_cat($html_dirname, $html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed; still delete any temp files before giving up,
            # as the normal exit path below does
            $self->clean_up_after_doc_obj_processing();
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin

        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->set_source_path($filename_full_path);
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        # The Source metadata describes the original document, so the
        # encoding is deduced from its name ($file), not from the name of
        # the generated html file.
        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

        my ($source_tailname, $source_dirname, $source_suffix)
            = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_tailname);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj, $topsection, $filename_no_path);

        # use the one generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed
    return 1;
}
398
# Writes an item file next to a converted html file.
#
# The directory holding $input_filename is scanned for imgN.jpg files. For
# each one, in numeric order, the matching textN.html is tidied with
# tidy_up_html() and a line "N:imgN.jpg:textN.html:" is written to
# <tailname>.item in the same directory.
#
# Returns the item file name, or $input_filename unchanged when the
# directory cannot be read or the item file cannot be created.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir($dir_handle);
    closedir($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    # Three-argument open, so unusual characters in the name cannot be read
    # as part of the open mode.
    my $item_handle;
    if (!open($item_handle, '>', $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # no item file was created, so fall back to the converted file,
        # as for an unreadable directory
        return $input_filename;
    }

    print $item_handle "<GeneratedBy>PowerPointPlugin\n";

    # one line per slide image, in numeric order
    foreach my $file (sort alphanum_sort @dir) {
        next unless $file =~ /^img(\d+)\.jpg$/;
        my $num = $1;
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $item_handle "$num:img$num.jpg:text$num.html:\n";
    }

    if (!close($item_handle)) {
        # buffered write errors only show up here
        print $outhandle "PowerPointPlugin: Error while writing $itemfile_name\n";
    }
    return $itemfile_name;
}
439
# Sort comparator (uses the sort variables $a and $b): want to sort
# img1, img2, ...img10, img11 etc.
#
# Each name is split into a leading run of non-digits and the run of digits
# that follows it. Names with different leading text are compared as whole
# strings; names with the same leading text are compared by number.
sub alphanum_sort {

    my ($a_txt, $a_num) = $a =~ /^([^\d]*)(\d*)/;
    my ($b_txt, $b_num) = $b =~ /^([^\d]*)(\d*)/;

    return ($a cmp $b) if $a_txt ne $b_txt;

    # a name with no digits after its text counts as number 0, rather than
    # comparing an empty string numerically
    $a_num = 0 if $a_num eq "";
    $b_num = 0 if $b_num eq "";

    # equal numbers (e.g. img1.jpg / img1.html): fall back to the whole name
    # so the resulting order is fully determined
    return ($a_num <=> $b_num) || ($a cmp $b);
}
449
# Want to remove the line that links to first page, last page, next page,
# text etc.
#
# Rewrites $filename in place, after copying the original to "$filename.bak":
#   - a line containing "<center>" that directly follows a line containing
#     "<body>" is dropped;
#   - every line containing "First page" is dropped.
# Does nothing when $filename is not a plain file, when the backup copy
# cannot be made, or when either file cannot be opened.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # make sure File::Copy is loaded; this file does not load it itself
    require File::Copy;

    # Without a fresh backup there is nothing safe to read from (a stale
    # .bak left by an earlier run must not overwrite the current file).
    File::Copy::copy($filename, $backup_filename) or return;

    open(my $original, '<', $backup_filename) or return;
    my $htmlfile;
    if (!open($htmlfile, '>', $filename)) {
        close($original);
        return;
    }

    while (defined(my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line straight after <body>
            $line = <$original>;
            last unless defined $line;      # <body> was the last line
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close($htmlfile);
    close($original);
}
4771;
478
Note: See TracBrowser for help on using the repository browser.