root/main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm @ 22880

Revision 22880, 14.5 KB (checked in by kjdon, 9 years ago)

Implemented the read method used when OpenOffice converts to html_multi: the PowerPoint file is converted to individual HTML files, two per slide (one for the image, one for the text). Each one is passed to HTMLPlugin for processing, so all the slides end up as individual documents, but the "first page", "back", "continue", etc. links still work to link them all together.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# PowerPointPlugin.pm -- plugin for importing Microsoft PowerPoint files.
4#  (basic version supports versions 95 and 97)
5#  (through OpenOffice extension, supports all contemporary formats)
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29package PowerPointPlugin;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
37use AutoLoadConverters;
38use ConvertBinaryFile;
39
# Establish the inheritance chain at compile time: ConvertBinaryFile is
# searched first, then AutoLoadConverters.
BEGIN {
    @PowerPointPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
43
# Set to 1 by new() once an OpenOffice converter has been detected; consulted
# by get_default_process_exp() to decide which file extensions to accept.
my $openoffice_available = 0;

# Values accepted by -convert_to when the Windows-specific arguments are in use.
my $windows_convert_to_list = [
    { 'name' => 'auto',         'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html',         'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text',         'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg_jpg', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { 'name' => 'pagedimg_gif', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { 'name' => 'pagedimg_png', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];

# Values accepted by -convert_to when the OpenOffice arguments are in use.
my $openoffice_convert_to_list = [
    { 'name' => 'auto',       'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html_multi', 'desc' => '{PowerPointPlugin.convert_to.html_multi}' },
    { 'name' => 'text',       'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg',   'desc' => '{PowerPointPlugin.convert_to.pagedimg}' },
];

# Arguments that are always available.  The process_exp default is stored as
# the text of a call and only resolved inside new(), because the answer
# depends on whether OpenOffice turns out to be available.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '&get_default_process_exp()',    # delayed (see new)
    },
];

# Extra arguments appended by new() when GSDLOS is "windows".
my $opt_windows_args = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $windows_convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'windows_scripting',
        'desc' => '{PowerPointPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra arguments appended by new() when an OpenOffice converter is available.
# NOTE(review): the default "html" is not one of the names in
# $openoffice_convert_to_list -- confirm whether "html_multi" was intended.
my $opt_office_args = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $openoffice_convert_to_list,
        'deft' => 'html',
    },
];

# Plugin description record pushed onto the caller's OptList by new().
my $options = {
    'name'           => 'PowerPointPlugin',
    'desc'           => '{PowerPointPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes',    # Source docs in PPT format can be replaced with GS-generated html
    'args'           => $arguments,
};
109
# Append each entry of $extra to $target unless that very same hash reference
# is already present (compared by reference identity, not by name).
sub _add_args_once {
    my ($target, $extra) = @_;

    foreach my $candidate (@$extra) {
        next if grep { $_ == $candidate } @$target;
        push(@$target, $candidate);
    }
}

# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - plugin arguments, handed on to both parent constructors
#   $hashArgOptLists - hash ref with "OptList" and "ArgList" arrays that this
#                      constructor (and the parents) append to
#
# The argument list is assembled at construction time: the Windows arguments
# are added when GSDLOS is "windows", the OpenOffice arguments when
# AutoLoadConverters reports that OpenOffice is available.  The file-level
# $arguments array is shared by every instance, so the optional entries are
# added at most once, and the process_exp default is recomputed by a direct
# call rather than by string-eval of whatever the previous construction left
# in 'deft' (which is a regular expression after the first call, not code).
#
# Returns the blessed plugin object.  When 'info_only' is set the object is
# returned before any conversion settings or secondary plugins are set up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push needs to happen later, after the arguments array has
    # been finished - it is used for parsing the input args.
    # The OptList needs to go in first, to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        _add_args_once($arguments, $opt_windows_args);
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        _add_args_once($arguments, $opt_office_args);
        $openoffice_available = 1;
    }
    # TODO need to do the case where they are both enabled!!! what will the convert to list be???

    # Resolve the default for process_exp - it has to be delayed until here so
    # we know whether openoffice is available, but must be done before the
    # args are parsed.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = &get_default_process_exp();
            last;
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PPT";

    if ($self->{'convert_to'} eq "auto") {
        if ($self->{'windows_scripting'}) {
            $self->{'convert_to'} = "pagedimg_jpg";
        }
        else {
            $self->{'convert_to'} = "html";
        }
    }

    my $outhandle = $self->{'outhandle'};

    # can't have windows_scripting and openoffice_conversion at the same time
    if ($self->{'windows_scripting'} && $self->{'openoffice_conversion'}) {
        print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
        print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        $self->{'openoffice_conversion'} = 0;
    }

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-file_rename_method", "none");
    push(@$specific_options, "-extract_language") if $self->{'extract_language'};

    if ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
        #is this true??
        push(@$specific_options,"-input_encoding", "utf8");
        if ($self->{'openoffice_conversion'}) {
            push(@$specific_options, "-create_thumbnail", "false", "-create_screenview", "false");
        }
    }

    # $self is already blessed into $class above.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
208
# Default process_exp: PowerPoint and OpenDocument presentation extensions
# when OpenOffice is available, otherwise only .ppt.  Matching is
# case-insensitive in both cases.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(ppt|pptx|odp)$^
        : q^(?i)\.ppt$^;
}
218
# Run the inherited init (ConvertBinaryFile, reached through SUPER) with the
# caller's arguments, then the AutoLoadConverters init with none.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->AutoLoadConverters::init();
}
227
# Run the AutoLoadConverters begin first (with no arguments), then the
# inherited begin with the caller's arguments.
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin();
    return $self->SUPER::begin(@begin_args);
}
235
# Run the AutoLoadConverters deinit first (with no arguments), then the
# inherited deinit with the caller's arguments.
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit();
    return $self->SUPER::deinit(@deinit_args);
}
243
# Override of the AutoLoadConverters version: when converting to an item file
# there is more work to do once the conversion itself has finished.
#
# Parameters: ($output_ext, $input_filename, $textref).
# Returns the name of the converted file (the generated item file for
# "pagedimg"), or "" when the OpenOffice conversion reports failure.  When
# OpenOffice conversion is not enabled the call is handed unchanged to
# ConvertBinaryFile::tmp_area_convert_file.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    unless ($self->{'openoffice_conversion'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # pagedimg is produced in two steps: first convert to html ...
    $output_ext = "html" if $self->{'convert_to'} eq "pagedimg";

    my ($result, $result_str, $new_filename) = $self->OpenOfficeConverter::convert($input_filename, $output_ext);
    if ($result == 0) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OpenOfficeConverter Conversion error\n";
        print $outhandle $result_str;
        return "";
    }

    # ... then describe the generated pages in an item file.
    return $new_filename unless $self->{'convert_to'} eq "pagedimg";

    my $item_filename = $self->generate_item_file($new_filename);
    return $item_filename;
}
274
# Override the default read in some situations: converting a ppt to html with
# OpenOffice (convert_to "html_multi") produces many html files, and we want
# every one of them to be processed as its own document.
#
# Parameters: ($pluginfo, $base_dir, $file, $block_hash, $metadata,
#              $processor, $maxdocs, $total_count, $gli)
#
# Returns:
#   undef  - this plugin cannot process the file
#   -1     - the conversion returned no filename, or the file does not exist
#   $rv    - the secondary plugin's status when one of the generated pages
#            was not processed (undefined or less than 1)
#   1      - every generated .html page was processed
# For any setting other than OpenOffice + html_multi the result is whatever
# BasePlugin::read returns.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    return undef unless $self->can_process_this_file($filename_full_path);

    # we are only doing something special for html_multi
    if (!($self->{'openoffice_conversion'} && $self->{'convert_to'} eq "html_multi")) {
        return $self->BasePlugin::read(@_);
    }
    my $outhandle = $self->{'outhandle'};
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path);
    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    if (! -e "$conv_filename") {return -1;}

    my ($html_tailname, $html_dirname, $html_suffix)
        = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");

    my $collect_file = &util::filename_within_collection($filename_full_path);
    my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};

    my @html_dir_entries;
    if (opendir (my $dir_handle, $html_dirname)) {
        @html_dir_entries = readdir ($dir_handle);
        closedir ($dir_handle);
    } else {
        print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
        # Just process the converted file itself.  The suffix returned by
        # fileparse already starts with the dot, so no separator is added.
        @html_dir_entries = ("$html_tailname$html_suffix");
    }

    # These describe the source presentation, not the individual pages, so
    # they are worked out once rather than for every generated file.
    my ($source_root) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    my $source_size = -s $filename_full_path;

    foreach my $html_file (@html_dir_entries) {
        next unless $html_file =~ /\.html$/;

        my ($rv, $doc_obj) =
            $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$html_file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
        if ((!defined $rv) || ($rv<1)) {
            # wasn't processed; still remove any temp files, as the normal
            # exit path below does
            $self->clean_up_after_doc_obj_processing();
            return $rv;
        }

        # next block copied from ConvertBinaryFile
        # from here ...
        # Override previous gsdlsourcefilename set by secondary plugin
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
        # build. so set it manually.
        $doc_obj->{'source_path'} = $filename_full_path;
        $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $html_file));

        $self->set_Source_metadata($doc_obj, $filename_no_path);

        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", $source_size);
        $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $source_root);

        my $topsection = $doc_obj->get_top_section();
        $self->add_associated_files($doc_obj, $filename_full_path);

        # extra_metadata is already called by sec plugin in process??
        $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # have we found a Title??
        $self->title_fallback($doc_obj,$topsection,$filename_no_path);

        # use the OID generated by HTMLPlugin, otherwise they all end up with same id.
        #$self->add_OID($doc_obj);
        # to here...

        # process it
        $processor->process($doc_obj);
        undef $doc_obj;
    }
    # counted once per source presentation, not once per generated page
    $self->{'num_processed'} ++;

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # the file has been processed
    return 1;
}
386
# Write a "<name>.item" file next to $input_filename listing every slide found
# in that directory.  A slide is any entry named img<N>.jpg; it is paired with
# text<N>.html, which is first passed through tidy_up_html.  Entries are
# written in alphanum_sort order, one "<N>:img<N>.jpg:text<N>.html:" line each,
# after a "<GeneratedBy>PowerPointPlugin" header line.
#
# Returns the item file name, or $input_filename unchanged when the directory
# cannot be read or the item file cannot be opened for writing.
sub generate_item_file
{
    my $self = shift(@_);
    my ($input_filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # find all the files in the directory
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print $outhandle "PowerPointPlugin: Couldn't read directory $dirname\n";
        return $input_filename;
    }
    my @dir = readdir ($dir_handle);
    closedir ($dir_handle);

    # start the item file
    my $itemfile_name = &util::filename_cat($dirname, "$tailname.item");

    # encoding specification????
    my $itemfile;
    if (!open ($itemfile, '>', $itemfile_name)) {
        print $outhandle "PowerPointPlugin: Couldn't open $itemfile_name for writing\n";
        # Nothing can be written, so do not claim an item file exists; fall
        # back to the input file as the unreadable-directory case does.
        return $input_filename;
    }
    print $itemfile "<GeneratedBy>PowerPointPlugin\n";

    foreach my $file (sort alphanum_sort @dir) {
        next unless $file =~ /^img(\d+)\.jpg$/;
        my $num = $1;
        $self->tidy_up_html(&util::filename_cat($dirname, "text$num.html"));
        print $itemfile "$num:img$num.jpg:text$num.html:\n";
    }
    close ($itemfile);
    return $itemfile_name;
}
427
# Sort comparator (uses sort's $a and $b) that orders img1, img2, ... img10,
# img11 numerically instead of character by character.  Each name is split
# into a leading run of non-digits and the digits that follow; names with
# different leading text are compared as whole strings, names with the same
# leading text are compared by the number.
sub alphanum_sort {
    my ($left_text,  $left_num)  = $a =~ /^([^\d]*)(\d*)/;
    my ($right_text, $right_num) = $b =~ /^([^\d]*)(\d*)/;

    return ($left_text eq $right_text)
        ? ($left_num <=> $right_num)
        : ($a cmp $b);
}
437
# Remove the navigation line that links to first page, last page, next page,
# text etc. from a generated slide html file.
#
# $filename is first copied to "$filename.bak" and then rewritten from that
# copy with these lines left out:
#   - the line directly after a line containing <body>, when it contains <center>
#   - any other line containing "First page"
# (The line directly after <body> is still subject to the "First page" check.)
# Nothing happens when $filename is not a plain file, when the backup copy
# cannot be made, or when either file cannot be opened.  The .bak file is
# left in place.
sub tidy_up_html {

    my $self = shift(@_);
    my ($filename) = @_;
    return unless (-f $filename);
    my $backup_filename = "$filename.bak";

    # Without a fresh copy we might read a stale .bak from an earlier run and
    # overwrite the real file with it, so stop if the copy fails.
    &File::Copy::copy($filename, $backup_filename) or return;

    my ($original, $htmlfile);
    open ($original, '<', $backup_filename) or return;
    if (!open ($htmlfile, '>', $filename)) {
        close ($original);
        return;
    }

    while (defined(my $line = <$original>)) {
        if ($line =~ /\<body\>/) {
            print $htmlfile $line;
            # look at the line that follows <body>
            $line = <$original>;
            last unless defined $line;    # <body> was the last line
            next if $line =~ /\<center\>/;
        }
        next if $line =~ /First page/;
        print $htmlfile $line;
    }

    close ($htmlfile);
    close ($original);
}
4651;
466
Note: See TracBrowser for help on using the browser.