root/gs2-extensions/open-office/trunk/src/perllib/plugins/OpenOfficePlugin.pm @ 32317

Revision 32317, 8.2 KB (checked in by ak19, 2 years ago)

Basic updates to the OpenOffice extension: 1. BasePlugin is now BaseImporter. 2. Warnings notified a change from a single-minus prefix to a double-minus prefix for the accept and headless arguments. Two bugs are still unresolved: (a) GLI will get stuck when launching if the OpenOffice extension is included and the extension itself gets stuck; this has been hard to track down so far, but this extension's jodconverter jar also gets stuck when run from the command line on these occasions. (b) It is still unclear how to get jodconverter to convert (Word) documents that contain images to HTML without always embedding the images inline as base64. jodconverter doesn't take the same option names as the soffice commands it calls, and jodconverter's command-line usage display is either incomplete or its options are very limited.

Line 
1###########################################################################
2#
3# OpenOfficePlugin.pm -- for processing office documents (word processing, RTF, presentation and spreadsheet files) via OpenOffice
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OpenOfficePlugin;
27
28use ConvertBinaryFile;
29use OpenOfficeConverter;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
# Set up the inheritance chain at compile time: ConvertBinaryFile is listed
# first, followed by OpenOfficeConverter.
BEGIN {
    @OpenOfficePlugin::ISA = qw(ConvertBinaryFile OpenOfficeConverter);
}
40
# Regex alternations of the file extensions handled for each document family.
# They are combined into the default process_exp and reused when working out
# the file type of a source document.
my $word_pe = "doc|dot|docx|odt|wpd";
my $rtf_pe  = "rtf";
my $ppt_pe  = "ppt|pptx|odp";
my $xls_pe  = "xls|xlsx|ods";

# Arguments contributed by this plugin; the only one is process_exp, whose
# default value is built from the extension lists above.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Plugin description record.
# The plugin is not yet fully functional (for example, PowerPoint files are
# not processed properly), so it is hidden in GLI for now ('hiddengli').
my $options = {
    'name'      => "OpenOfficePlugin",
    'desc'      => "{OpenOfficePlugin.desc}",
    'abstract'  => "no",
    'inherits'  => "yes",
    'hiddengli' => "yes",
    'args'      => $arguments,
};
61
62
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to both parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#
# Builds an OpenOfficeConverter object and a ConvertBinaryFile object, merges
# them with BaseImporter::merge_inheritance, and blesses the result into
# $class. When 'info_only' is set on the merged object, it is returned
# straight away without any conversion set-up.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $ooc_self = OpenOfficeConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self     = BaseImporter::merge_inheritance($ooc_self, $cbf_self);

    # Information-only mode: don't worry about any options etc.
    return bless($self, $class) if $self->{'info_only'};

    # Record whether the OpenOffice conversion machinery is usable, as
    # reported by the OpenOfficeConverter package variable.
    $self->{'openoffice_ext_working'} =
        $OpenOfficeConverter::openoffice_conversion_available ? 1 : 0;
    $self->{'convert_to'} = "structuredhtml";

    bless $self, $class;

    # Sets convert_to_plugin and convert_to_ext.
    $self->set_standard_convert_settings();

    # Make sure an option list exists for the secondary plugin before the
    # secondary plugins are loaded.
    my $plugin_name    = $self->{'convert_to_plugin'};
    my $plugin_options = $self->{'secondary_plugin_options'};
    $plugin_options->{$plugin_name} = []
        unless defined $plugin_options->{$plugin_name};

    $self->load_secondary_plugins($class, $plugin_options, $hashArgOptLists);

    return $self;
}
106
# Initialise the plugin.
#
# The arguments after $self are ($verbosity, $outhandle, $failhandle); they
# are forwarded unchanged to the parent init reached through SUPER. The
# OpenOfficeConverter init is then called with no arguments and its result
# is returned.
sub init {
    my $self = shift @_;

    $self->SUPER::init(@_);
    return $self->OpenOfficeConverter::init();
}
114
# Called at the start of a run.
#
# The arguments after $self are ($pluginfo, $base_dir, $processor, $maxdocs);
# they are forwarded unchanged first to the parent begin reached through
# SUPER and then to OpenOfficeConverter::begin, whose result is returned.
sub begin {
    my $self = shift @_;

    $self->SUPER::begin(@_);
    return $self->OpenOfficeConverter::begin(@_);
}
122
123
# Return the default process_exp: a case-insensitive regular expression
# (as a string) matching file names that end in a literal dot followed by
# one of the Word, PowerPoint, RTF or Excel style extensions listed at the
# top of this file.
#
# May be called as a plain function (as done when building the argument
# list), in which case $self is undefined; it is not used.
sub get_default_process_exp {
    my $self = shift (@_);

    # The backslash must be doubled: inside a double-quoted string "\."
    # collapses to a bare ".", which in a regex matches any character and
    # would accept names such as "mydoc" that have no extension at all.
    return "(?i)\\.($word_pe|$ppt_pe|$rtf_pe|$xls_pe)\$";
}
129
130
# Build the document object for a source file.
#
# The arguments after $self are ($pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli); they are passed on
# unchanged to whichever parent implementation is selected.
sub read_into_doc_obj {
    my $self = shift @_;

    if ($self->{'openoffice_ext_working'}) {
        # Conversion is available: use the ConvertBinaryFile version, which
        # will call tmp_area_convert_file and use the secondary plugins.
        return $self->ConvertBinaryFile::read_into_doc_obj(@_);
    }

    # No conversion available: use the BaseImporter version to set up the
    # doc obj; a little more is then done in process.
    return $self->BaseImporter::read_into_doc_obj(@_);
}
144
# Override of tmp_area_convert_file so that our own convert method is used.
#
# Arguments: ($output_ext, $input_filename, $textref); $textref is unused.
# Hands the work to OpenOfficeConverter::convert. Returns the converted file
# name when the result code is non-zero; otherwise writes an error message
# plus the converter's result string to the plugin's outhandle and returns
# the empty string.
sub tmp_area_convert_file {
    my $self = shift @_;
    my ($output_ext, $input_filename, $textref) = @_;

    my ($status, $message, $converted_file) =
        $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    if ($status == 0) {
        my $out = $self->{'outhandle'};
        print $out "Open Office Conversion error\n";
        print $out $message;
        return "";
    }

    return $converted_file;
}
159
# Work out the document family of a file from its extension.
#
# Argument: $file - the file name (or path) to classify.
# Returns a two-element list:
#   - the first value is used in _iconxx_ to give a srcicon,
#   - the second value is used for FileFormat metadata.
# ("unknown", "Unknown") is returned when no extension list matches.
#
# Each test is anchored to the final ".ext" of the name and is
# case-insensitive, in line with the default process_exp. An unanchored
# match would classify "doctor.xls" as Word because the name contains "doc",
# and a case-sensitive one would miss "REPORT.XLS".
sub get_file_type_from_extension {
    my $self = shift (@_);
    my ($file) = @_;

    # check against the various bits of process_exp
    if ($file =~ /\.(?:$word_pe)$/i) {
        return ("doc", "Word");
    }
    if ($file =~ /\.(?:$ppt_pe)$/i) {
        return ("ppt", "PPT");
    }
    if ($file =~ /\.(?:$xls_pe)$/i) {
        return ("xls", "Excel");
    }
    if ($file =~ /\.(?:$rtf_pe)$/i) {
        return ("rtf", "RTF");
    }

    return ("unknown", "Unknown");
}
182
183
# Plugin-specific processing of an already created document object.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
#
# Associates the original source file with the top section, sets the
# FileFormat, srclink_file and srcicon metadata, and, when OpenOffice
# conversion is not available, reports this and adds dummy text.
#
# Returns 1. The return value is now explicit (as in process_old) rather
# than being whatever the trailing "if" statement happened to evaluate last.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $cursection = $doc_obj->get_top_section();

    # store original file as associated file
    my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
    my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    my ($ext, $format) = $self->get_file_type_from_extension($file);
    # overwrite the one set by secondary plugin
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $format);

    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", "[SourceFile]");
    $doc_obj->add_utf8_metadata ($cursection, "srcicon",  "_icon".$ext."_");

    # if oo conversion not available, we have no text, so add some
    if (!$self->{'openoffice_ext_working'}) {
        if ($gli) {
            # Warning element written to STDERR only when $gli is set.
            &gsprintf(STDERR, "<Warning p='openOfficePlugin' r='{OpenOfficeConverter.noconversionavailable}: {OpenOfficeConverter.".$self->{'no_openoffice_conversion_reason'}."}'>");
        }
        print STDERR "OpenOfficePlugin: no conversion available, just adding $file as is\n";
        # we have no text - adds dummy text and NoText metadata
        $self->add_dummy_text($doc_obj, $doc_obj->get_top_section());
    }

    return 1;
}
212
# Older version of process, kept alongside the current one.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
#
# Associates the original file with the top section of the document, sets
# the FileFormat metadata from $self->{'file_type'} ("unknown" when that is
# not defined), and adds srcicon and srclink_file metadata. The associated
# file is named "doc.<ext>" unless 'keep_original_filename' is 1, in which
# case the name used for the Source/SourceFile metadata is kept and
# srclink_file becomes "[SourceFile]". The explicit srclink anchor metadata
# that used to be generated here is no longer emitted.
#
# Always returns 1.
sub process_old {
    my $self = shift @_;
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $doc_ext   = $self->{'filename_extension'};
    my $file_type = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename   = &FileUtils::filenameConcatenate($base_dir, $file);

    my $keep_name = ($self->{'keep_original_filename'} == 1);

    # When the original name is kept, it must be the same file name that was
    # used for the Source and SourceFile metadata, because [SourceFile] is
    # used in the srclink.
    my $assocfilename = $keep_name
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # set rather than add: only one FileFormat value is wanted
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_name ? "[SourceFile]" : "doc.$doc_ext";

    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon" . $doc_ext . "_");
    $doc_obj->add_utf8_metadata($cursection, "srclink_file", $srclink_filename);

    return 1;
}
Note: See TracBrowser for help on using the browser.