root/main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm @ 22874

Revision 22874, 11.4 KB (checked in by kjdon, 9 years ago)

No longer use filename_extension, as we should be using the original extension; e.g. when processing an odt doc with the Word plugin, the associated file used to be doc.doc instead of doc.odt.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
4#  (basic version supports versions 95 and 97)
5#  (through OpenOffice extension, supports all contemporary formats)
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package PowerPointPlugin;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
37use AutoLoadConverters;
38use ConvertBinaryFile;
39
# Establish the inheritance chain at compile time: ConvertBinaryFile is
# searched first, then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
43
# Becomes 1 once new() learns from AutoLoadConverters that OpenOffice is
# available; consulted by get_default_process_exp().
my $openoffice_available = 0;

# Choices for the -convert_to option in the Windows option group.
# Each pair is [ option value, resource key of its description ].
my $windows_convert_to_list = [
    map { +{ 'name' => $_->[0], 'desc' => $_->[1] } } (
        [ "auto",         "{ConvertBinaryFile.convert_to.auto}" ],
        [ "html",         "{ConvertBinaryFile.convert_to.html}" ],
        [ "text",         "{ConvertBinaryFile.convert_to.text}" ],
        [ "pagedimg_jpg", "{ConvertBinaryFile.convert_to.pagedimg_jpg}" ],
        [ "pagedimg_gif", "{ConvertBinaryFile.convert_to.pagedimg_gif}" ],
        [ "pagedimg_png", "{ConvertBinaryFile.convert_to.pagedimg_png}" ],
    )
];

# Choices for the -convert_to option in the OpenOffice option group.
my $openoffice_convert_to_list = [
    map { +{ 'name' => $_->[0], 'desc' => $_->[1] } } (
        [ "auto",     "{ConvertBinaryFile.convert_to.auto}" ],
        [ "html",     "{PowerPointPlugin.convert_to.oo_html}" ],
        [ "text",     "{ConvertBinaryFile.convert_to.text}" ],
        [ "pagedimg", "{PowerPointPlugin.convert_to.pagedimg}" ],
    )
];
71
# Arguments every PowerPointPlugin instance accepts.  new() appends the
# platform-specific groups below to this list before it is parsed.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        # Placeholder only: new() fills in the real default once it knows
        # whether OpenOffice is available.
        'deft' => "&get_default_process_exp()",
    },
];

# Extra arguments that new() adds when GSDLOS says we are on Windows.
my $opt_windows_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $windows_convert_to_list,
        'deft' => "html",
    },
    {
        'name' => "windows_scripting",
        'desc' => "{PowerPointPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Extra arguments that new() adds when OpenOffice conversion is available.
my $opt_office_args = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $openoffice_convert_to_list,
        'deft' => "html",
    },
];
102
# Plugin description record that new() pushes onto the shared OptList.
my $options = {
    'name'           => "PowerPointPlugin",
    'desc'           => "{PowerPointPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PPT format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
109
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays receive
#                      this plugin's option record and argument descriptions
#
# Returns the blessed plugin object.  When 'info_only' is set on the merged
# object, it is returned straight after blessing, with no conversion settings
# or secondary plugins configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array is complete
    # (it is used for parsing the input args); the option record goes in
    # first to get the printed info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # $arguments is shared by every instance of this plugin, so each optional
    # group is appended only if it is not already there; otherwise a second
    # construction would list the same options twice.
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        push(@$arguments,@$opt_windows_args)
            unless grep { $_ == $opt_windows_args->[0] } @$arguments;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        push(@$arguments,@$opt_office_args)
            unless grep { $_ == $opt_office_args->[0] } @$arguments;
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Work out the default for process_exp.  This is delayed until here so we
    # know whether openoffice is available, but must happen before the args
    # are parsed.  The default is computed directly rather than by eval'ing
    # the stored string: once that string has been replaced by a regex,
    # eval'ing it again on a later construction would yield undef.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = get_default_process_exp();
            last;
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    # Resolve "auto": paged JPEG images under Windows scripting, html otherwise.
    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # $self is already blessed into $class above.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
208
# Returns the default filename regex (as a string) for files this plugin
# processes: .ppt only, or .ppt/.pptx/.odp once OpenOffice has been flagged
# as available.
sub get_default_process_exp {
    my $self = shift (@_);

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
218
# Runs the inherited init (found via SUPER, with the caller's arguments) and
# then the AutoLoadConverters init, which takes no extra arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->AutoLoadConverters::init();
}
227
# Runs the AutoLoadConverters begin first, then the inherited begin (found
# via SUPER) with the caller's arguments.
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin();
    $self->SUPER::begin(@begin_args);
}
235
# Runs the AutoLoadConverters deinit first, then the inherited deinit (found
# via SUPER) with the caller's arguments.
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit();
    $self->SUPER::deinit(@deinit_args);
}
243
# Overrides the AutoLoadConverters version, because converting to an item
# file needs extra work after the conversion itself.
#
# Arguments: ($output_ext, $input_filename, $textref).
# Returns the name of the converted file (or of the generated item file when
# convert_to is "pagedimg"), or "" if the OpenOffice conversion failed.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    # No OpenOffice conversion requested: hand the untouched arguments to the
    # ConvertBinaryFile implementation.
    unless ($self->{'openoffice_conversion'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # Paged images are produced by first converting to html.
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($status, $message, $converted_filename) = $self->OpenOfficeConverter::convert($input_filename, $output_ext);
    if ($status == 0) {
        my $log = $self->{'outhandle'};
        print $log "OpenOfficeConverter Conversion error\n";
        print $log $message;
        return "";
    }

    if ($self->{'convert_to'} eq "pagedimg") {
        my $item_filename = $self->generate_item_file($converted_filename);
        return $item_filename;
    }

    return $converted_filename;
}
274
# Alternative read: the conversion of ppt to html results in many files, and
# we want them all to be processed.
# NOTE(review): the _XX suffix presumably keeps this from overriding the
# inherited read -- confirm before renaming.
#
# Returns undef if this plugin cannot process the file; otherwise the status
# reported by read_into_doc_obj (1 means the file has been processed).
sub read_XX {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($full_path)) {
        return undef;
    }

    my ($status, $document) = $self->read_into_doc_obj(@_);

    if (defined $status && $status == 1) {
        # process the document, then let go of it
        $processor->process($document);
        $self->{'num_processed'}++;
        undef $document;
    }

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    return $status;
}
303
# Writes "<name>.item" next to $input_filename, with one line per imgN.jpg
# found in the same directory ("N:imgN.jpg:textN.html:"), in numeric order.
# Each matching textN.html is passed through tidy_up_html first.
#
# Returns the item file name, or $input_filename unchanged if the directory
# cannot be read or the item file cannot be opened for writing.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dir_handle);
    closedir ($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $item_handle;
    if (!open ($item_handle, '>', $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so fall back the same way as when the
        # directory cannot be read.
        return $input_filename;
    }
    print $item_handle "<GeneratedBy>PowerPointPlugin\n";

    # one entry per slide image, img1, img2, ... img10 in numeric order
    foreach my $file (sort alphanum_sort @dir) {
        if ($file =~ /^img(\d+)\.jpg$/) {
            my $num = $1;
            $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
            print $item_handle "$num:img$num.jpg:text$num.html:\n";
        }
    }

    # Buffered write errors only surface at close, so report them.
    if (!close ($item_handle)) {
        print $outhandle "PowerPointPlugin: Error closing $itemfile_name: $!\n";
    }
    return $itemfile_name;
}
344
# Sort comparator (reads the sort variables $a and $b) that orders
# img1, img2, ... img10, img11 numerically.  Names are split into a leading
# non-digit prefix and the digits that follow it: names with different
# prefixes compare as whole strings, names with the same prefix compare by
# the number.
sub alphanum_sort {
    my ($left_prefix,  $left_digits)  = ($a =~ /^([^\d]*)(\d*)/);
    my ($right_prefix, $right_digits) = ($b =~ /^([^\d]*)(\d*)/);

    return $left_prefix eq $right_prefix
        ? ($left_digits <=> $right_digits)
        : ($a cmp $b);
}
354
# Rewrites $filename in place, dropping the navigation that links to first
# page, last page, next page, text etc.:
#   - the line directly after a <body> line, if it contains <center>
#   - every line containing "First page"
# The untouched original is kept as "$filename.bak".  Does nothing if
# $filename is not a plain file, or if the backup cannot be made or read.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # Stop if the backup could not be made; otherwise a stale .bak left over
    # from an earlier run could be read and written over the real file.
    &File::Copy::copy($filename, $backup_filename) or return;

    open (my $original, '<', $backup_filename) or return;
    my $htmlfile;
    if (!open ($htmlfile, '>', $filename)) {
        close ($original);
        return;
    }

    while (defined (my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line following <body>; it may not exist at EOF
            $line = <$original>;
            last unless defined $line;
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile ($line);
    }

    close ($htmlfile);
    close ($original);
}
3821;
383
Note: See TracBrowser for help on using the browser.