source: gs2-extensions/open-office/trunk/src/perllib/plugins/OpenOfficePlugin.pm@ 32317

Last change on this file since 32317 was 32317, checked in by ak19, 6 years ago

Basic updates to OpenOffice extension: 1. BasePlugin is now BaseImporter 2. Warnings notified a change from single minus prefix to accept and headless arguments, to double minus prefix. Still haven't resolved 2 bugs: GLI will get stuck when launching when the open office ext is included and if this gets stuck. Hard to track down so far, but this ext's jodconverter jar gets stuck running from the cmdline on these occasions too. And haven't resolved how to get jodconverter to convert (word) docs that contain images to HTML without always embedding the images inline as base64. jodconverter doesn't take the same names for options as the soffice commands it calls. Jodconverter's cmdline usage display is either not complete or the options are very limited.

File size: 8.2 KB
Line 
1###########################################################################
2#
3# OpenOfficePlugin.pm -- for processing office documents (word processor,
3# RTF, presentation and spreadsheet files) via OpenOffice
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OpenOfficePlugin;
27
28use ConvertBinaryFile;
29use OpenOfficeConverter;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33no strict 'subs';
34
35use gsprintf 'gsprintf';
36
# Set the parent classes at compile time. Method lookup tries
# ConvertBinaryFile first, then OpenOfficeConverter.
BEGIN {
    @OpenOfficePlugin::ISA = qw(ConvertBinaryFile OpenOfficeConverter);
}
40
# Filename extensions this plugin deals with, grouped by document family.
# Each value is a regex alternation (no dot, no anchors); the pieces are
# combined in get_default_process_exp() and tested individually in
# get_file_type_from_extension().
my $word_pe = 'doc|dot|docx|odt|wpd';
my $rtf_pe  = 'rtf';
my $ppt_pe  = 'ppt|pptx|odp';
my $xls_pe  = 'xls|xlsx|ods';
45
# Argument descriptions contributed by this plugin. Only process_exp is
# declared here; its default is computed once, when this file is loaded.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];
53
# Plugin description that new() appends to the shared option list.
# This plugin is not quite fully functional (e.g. it doesn't do proper
# PowerPoint processing), so it is kept hidden in GLI for now via 'hiddengli'.
my $options = {
    'name'      => "OpenOfficePlugin",
    'desc'      => "{OpenOfficePlugin.desc}",
    'abstract'  => "no",
    'inherits'  => "yes",
    'hiddengli' => "yes",
    'args'      => $arguments,
};
61
62
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to both parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the plugin object blessed into $class. When the merged object has
# 'info_only' set, it is returned right away without any conversion setup.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # Construct one object per parent class, then merge them into one.
    my $ooc_self = OpenOfficeConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self     = BaseImporter::merge_inheritance($ooc_self, $cbf_self);

    # don't worry about any options etc
    return bless $self, $class if $self->{'info_only'};

    # Remember whether the converter reported that conversion is available.
    $self->{'openoffice_ext_working'} =
        $OpenOfficeConverter::openoffice_conversion_available ? 1 : 0;
    $self->{'convert_to'} = "structuredhtml";

    bless $self, $class;

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    # Make sure the secondary plugin has an (initially empty) option list
    # before the secondary plugins are loaded.
    my $secondary_plugin_name    = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    $secondary_plugin_options->{$secondary_plugin_name} = []
        unless defined $secondary_plugin_options->{$secondary_plugin_name};

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
106
# Initialise the plugin. The caller's arguments (verbosity, outhandle,
# failhandle) are forwarded to the inherited init(); the converter's own
# init() is then run without arguments and its result is returned.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->OpenOfficeConverter::init();
}
114
# Start-of-import hook. The caller's arguments (pluginfo, base_dir,
# processor, maxdocs) are forwarded first to the inherited begin() and then
# to the converter's begin(), whose result is returned.
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    return $self->OpenOfficeConverter::begin(@begin_args);
}
122
123
# Build the default process_exp: a case-insensitive regular expression
# (returned as a string) matching filenames that end in a dot followed by
# one of the word-processor, presentation, RTF or spreadsheet extensions.
#
# Works both as a method and as a plain function call; the invocant, if
# any, is ignored.
sub get_default_process_exp {
    my $self = shift (@_);

    my $extensions = join("|", $word_pe, $ppt_pe, $rtf_pe, $xls_pe);

    # "\\." puts a real backslash into the returned string, so the regex
    # engine sees an escaped (literal) dot. A bare "\." inside double quotes
    # collapses to ".", which would match any character before the extension.
    return "(?i)\\.($extensions)\$";
}
129
130
# Choose how the document object gets built.
#
# With a working OpenOffice conversion, the ConvertBinaryFile version is
# used; it will call tmp_area_convert_file and use secondary plugins.
# Otherwise the BaseImporter version sets up the doc obj, and process()
# does a little bit more afterwards.
#
# All arguments are passed through untouched; the chosen implementation's
# return value is returned as is.
sub read_into_doc_obj {
    my ($self, @args) = @_;

    return $self->{'openoffice_ext_working'}
        ? $self->ConvertBinaryFile::read_into_doc_obj(@args)
        : $self->BaseImporter::read_into_doc_obj(@args);
}
144
# Override so that conversion goes through our own convert method.
#
#   $output_ext     - extension of the format to convert to
#   $input_filename - file to convert
#   $textref        - accepted but not used here
#
# Returns the converted file's name, or "" after writing an error message
# and the converter's result string to the plugin's outhandle.
sub tmp_area_convert_file {
    my ($self, $output_ext, $input_filename, $textref) = @_;

    my ($status, $message, $converted_file) =
        $self->OpenOfficeConverter::convert($input_filename, $output_ext);

    # A zero status is treated as conversion failure.
    if ($status == 0) {
        my $out = $self->{'outhandle'};
        print $out "Open Office Conversion error\n";
        print $out $message;
        return "";
    }

    return $converted_file;
}
159
# Classify a file by its extension.
#
#   $file - filename (with or without a leading path)
#
# Returns a two-element list:
#   first value is used in _iconxx_ to give a srcicon,
#   second value is used for FileFormat metadata.
# ("unknown", "Unknown") is returned when the extension is not one of the
# families in process_exp, or when $file is undefined.
sub get_file_type_from_extension {
    my $self = shift (@_);
    my ($file) = @_;

    return ("unknown", "Unknown") unless defined $file;

    # check against the various bits of process_exp
    my @families = (
        [ $word_pe, "doc", "Word"  ],
        [ $ppt_pe,  "ppt", "PPT"   ],
        [ $xls_pe,  "xls", "Excel" ],
        [ $rtf_pe,  "rtf", "RTF"   ],
    );

    foreach my $family (@families) {
        my ($alternation, $icon, $format) = @{$family};

        # Only the trailing ".ext" counts, in any letter case. An unanchored
        # match would classify e.g. "doctor.xls" as Word and miss "A.DOC".
        return ($icon, $format) if $file =~ /\.(?:$alternation)$/i;
    }

    return ("unknown", "Unknown");
}
182
183
# Plugin-specific processing of an already created document object.
#
#   $pluginfo, $metadata - accepted but not used here
#   $base_dir, $file     - joined to locate the original source file
#   $doc_obj             - document object to decorate
#   $gli                 - when true, an extra <Warning> element is written
#                          to STDERR if no conversion is available
#
# Associates the original file with the top section and sets FileFormat,
# srclink_file and srcicon metadata. When OpenOffice conversion is not
# available, dummy text is added because no text was extracted.
#
# Always returns 1, as process_old does. Previously there was no explicit
# return, so the result was whatever the last evaluated expression yielded.
# NOTE(review): callers are presumed to treat a defined/true value as
# success - confirm against the importer that invokes process().
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $cursection = $doc_obj->get_top_section();

    # store original file as associated file
    my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
    my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    my ($ext, $format) = $self->get_file_type_from_extension($file);
    # overwrite the one set by secondary plugin
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $format);

    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", "[SourceFile]");
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$ext."_");

    # if oo conversion not available, we have no text, so add some
    if (!$self->{'openoffice_ext_working'}) {
        if ($gli) {
            &gsprintf(STDERR, "<Warning p='openOfficePlugin' r='{OpenOfficeConverter.noconversionavailable}: {OpenOfficeConverter.".$self->{'no_openoffice_conversion_reason'}."}'>");
        }
        print STDERR "OpenOfficePlugin: no conversion available, just adding $file as is\n";
        # we have no text - adds dummy text and NoText metadata
        $self->add_dummy_text($doc_obj, $doc_obj->get_top_section());
    }

    return 1;
}
212
# Older variant of process(); nothing in this file calls it.
#
#   $pluginfo, $metadata, $gli - accepted but not used here
#   $base_dir, $file           - joined to locate the original source file
#   $doc_obj                   - document object to decorate
#
# Associates the original file with the top section and sets FileFormat,
# srcicon and srclink_file metadata. The associated and linked file is named
# "doc.<ext>" unless keep_original_filename is 1, in which case the name
# used for the Source/SourceFile metadata is kept. Returns 1.
sub process_old {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # need to check that not empty
    my $doc_ext       = $self->{'filename_extension'};
    my $file_type     = defined $self->{'file_type'} ? $self->{'file_type'} : "unknown";
    my $keep_original = ($self->{'keep_original_filename'} == 1);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename   = &FileUtils::filenameConcatenate($base_dir, $file);

    # When keeping the original name, this should be the same filename that
    # was used for the Source and SourceFile metadata, as [SourceFile] is
    # used in the srclink.
    my $assocfilename = $keep_original
        ? $doc_obj->get_assocfile_from_sourcefile()
        : "doc.$doc_ext";
    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);

    my $srclink_filename = $keep_original ? "[SourceFile]" : "doc.$doc_ext";

    # Only srcicon and srclink_file are emitted; the explicit srclink and
    # /srclink anchor metadata that used to sit here stays disabled.
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $srclink_filename);

    return 1;
}
Note: See TracBrowser for help on using the repository browser.