source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22611

Last change on this file since 22611 was 22611, checked in by kjdon, 14 years ago

now uses OpenOfficeConverter that is not ConvertBinaryFile

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use ConvertBinaryFile;
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32no strict 'subs';
33use gsprintf 'gsprintf';
34
# @ISA is configured at compile time: ConvertBinaryFile is always a parent,
# and OpenOfficeConverter is appended when the OpenOffice extension loads
# and reports that conversion is available.

# Do not initialise these variables here: they are assigned inside BEGIN,
# and an initialiser on the declaration would run later and overwrite them.
my $openoffice_ext_installed;
my $openoffice_ext_working;
sub BEGIN {
    # Try to load the optional OpenOffice extension.
    eval { require OpenOfficeConverter; };
    my $load_error = $@;

    if ($load_error) {
        # Useful debugging output if OpenOfficeConverter has a syntax error.
        print STDERR "$load_error\n";
    }
    $openoffice_ext_installed = $load_error ? 0 : 1;

    # The extension only counts as working if it says it can run soffice.
    $openoffice_ext_working =
        ($openoffice_ext_installed
         && $OpenOfficeConverter::openoffice_conversion_available) ? 1 : 0;

    @WordPlugin::ISA = $openoffice_ext_working
        ? ('ConvertBinaryFile', 'OpenOfficeConverter')
        : ('ConvertBinaryFile');
}
69
# Arguments WordPlugin always accepts; new() may append the optional
# scripting-related argument lists to this same array.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => &get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
80
81
# Extra argument offered only when running on Windows.
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra argument offered only when the OpenOffice extension is working.
my $opt_openoffice_args = [
    {
        'name' => 'openoffice_scripting',
        'desc' => '{OpenOfficeConverter.openoffice_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Arguments shared by both scripting back-ends (Windows or OpenOffice).
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlugin.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlugin.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlugin.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlugin.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];
126
127
# Plugin description record that new() appends to the caller's OptList.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
134
# Append to @$target only those argument descriptions from @$extra whose
# 'name' is not already listed there.  The target is the shared file-level
# argument list, so without this check every call to new() would add the
# optional arguments again.
sub _add_optional_args {
    my ($target, $extra) = @_;

    my %seen = map { $_->{'name'} => 1 } @$target;
    foreach my $arg (@$extra) {
        next if $seen{$arg->{'name'}}++;
        push(@$target, $arg);
    }
}

# Constructor.
#
# Parameters:
#   $class           - class to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed straight through to the parent constructor(s)
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#
# Returns the new plugin object blessed into $class.  When the parent
# constructor sets 'info_only', the object is returned before any conversion
# settings or secondary plugins are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if ($openoffice_ext_installed) {
        print STDERR "WordPlugin: OpenOffice Extension to Greenstone detected\n";
        if ($openoffice_ext_working) {
            print STDERR "... and it appears to be working\n";
        } else {
            print STDERR "... but it appears to be broken\n";
            &gsprintf(STDERR, "OpenOfficeConverter: {OpenOfficeConverter.noconversionavailable} ({OpenOfficeConverter.$OpenOfficeConverter::no_openoffice_conversion_reason})\n");
        }
    }

    # Work out which optional arguments apply in this environment.
    my $office_capable = 0;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        _add_optional_args($arguments, $opt_windows_args);
        $office_capable = 1;
    }
    if ($openoffice_ext_working) {
        _add_optional_args($arguments, $opt_openoffice_args);
        $office_capable = 1;
    }
    # these office args apply to windows scripting or to openoffice scripting
    if ($office_capable) {
        _add_optional_args($arguments, $opt_office_args);
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = {};
    if ($openoffice_ext_working) {
        # Build one object from each parent and merge their state.
        my $ooc_self = OpenOfficeConverter->new($pluginlist, $inputargs, $hashArgOptLists);
        my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
        $self = BasePlugin::merge_inheritance($ooc_self, $cbf_self);
    }
    else {
        $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    }

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
        }
        else {
            $self->{'convert_options'} = "-openoffice_scripting";
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # With the merged-inheritance object above, $self must be blessed before
    # any method can be called on it.
    $self = bless $self, $class;
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): this adds a second -metadata_fields option after the
        # fixed one above; which one the secondary plugin honours depends on
        # its argument parser - confirm.
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # $self is already blessed into $class.  A one-argument bless here would
    # re-bless it into WordPlugin and break any subclass.
    return $self;
}
270
# Initialise the plugin: run the inherited init with the given verbosity and
# handles, then OpenOfficeConverter's init when that extension is working.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);
    $self->OpenOfficeConverter::init() if $openoffice_ext_working;
}
280
# Called only once, after all plugin passes have been done.  Runs
# OpenOfficeConverter's deinit first (when that extension is working) and
# then the inherited deinit.
sub deinit {
    my $self = shift (@_);

    $self->OpenOfficeConverter::deinit() if $openoffice_ext_working;
    $self->SUPER::deinit();
}
290
# Return the default filename regular expression for this plugin.  The
# docx and odt extensions are only included when the OpenOffice extension
# is working.
sub get_default_process_exp {
    my $self = shift (@_);

    return $openoffice_ext_working
        ? q^(?i)\.(doc|dot|docx|odt)$^
        : q^(?i)\.(doc|dot)$^;
}
298
# Convert $input_filename to the $output_ext format.
# When the OpenOffice extension is working and -openoffice_scripting was
# requested, OpenOfficeConverter's convert method does the work; otherwise
# the call is handed to ConvertBinaryFile's implementation unchanged.
# On the OpenOffice path this returns the converted filename, or "" after
# printing the error text to the plugin's outhandle.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    unless ($openoffice_ext_working && $self->{'openoffice_scripting'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    my ($status, $message, $converted_file) =
        $self->OpenOfficeConverter::convert($input_filename, $output_ext);
    # A non-zero status is treated as success.
    return $converted_file if ($status != 0);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "Open Office Conversion error\n";
    print $outhandle $message;
    return "";
}
# Old post-processing step, kept for reference.  It detects the language and
# encoding of the converted file and reads the file into a local buffer.
# The steps that would fix up invalid utf-8 and write the file back are
# disabled, so the file on disk is left untouched.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # Disabled: turn any high bytes that aren't valid utf-8 into utf-8
    # (unicode::ensure_utf8) and write the text out again (utf8_write_file).
}
337
# Modified to cache HTML files for efficiency reasons rather than delete
# them all.  The HTML is modified not to use IE's VML; VML uses WMZ files,
# so those are the only files removed here.
#
# Deletes every *.wmz file found directly inside $self->{'files_dir'}.
# Does nothing when 'files_dir' is not set or the directory cannot be
# opened (an HTML file with no supporting images has no _files directory).
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # Lexical dirhandle: a bareword DIN would be a package-global handle that
    # any other code using the same name could clobber.
    opendir(my $dirhandle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dirhandle);
    closedir($dirhandle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = &util::filename_cat($html_files_dir, $wmz_file);
        &util::rm($full_path);
    }
    return;
}
360
361
3621;
363
Note: See TracBrowser for help on using the repository browser.