source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22611

Last change on this file since 22611 was 22611, checked in by kjdon, 14 years ago

now uses OpenOfficeConverter that is not ConvertBinaryFile

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
RevLine 
[1410]1###########################################################################
2#
[15872]3# WordPlugin.pm -- plugin for importing Microsoft Word documents
[1410]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[15872]25package WordPlugin;
[1410]26
[15872]27use ConvertBinaryFile;
[22428]28
29
[10254]30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
[22507]32no strict 'subs';
33use gsprintf 'gsprintf';
[1410]34
[22428]35# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile
36
[22507]37# do not initialise these variables
38my $openoffice_ext_installed;
39my $openoffice_ext_working;
# Compile-time setup: try to load OpenOfficeConverter, record whether it is
# installed and whether it reports that conversion is available, and build
# @WordPlugin::ISA accordingly. The two status flags are file-level lexicals
# that must not be given initialisers where they are declared, because this
# block assigns them before any run-time initialiser would execute.
sub BEGIN {
    # Assume the extension is missing until the require below succeeds.
    ($openoffice_ext_installed, $openoffice_ext_working) = (0, 0);

    eval("require OpenOfficeConverter");
    if ($@) {
        # Useful debugging statement if there is a syntax error in OpenOfficeConverter:
        print STDERR "$@\n";
    }
    else {
        # Successfully found; now check whether it can run soffice.
        $openoffice_ext_installed = 1;
        $openoffice_ext_working = 1
            if $OpenOfficeConverter::openoffice_conversion_available;
    }

    # ConvertBinaryFile is always the first parent; OpenOfficeConverter is
    # added as a second parent only when it is usable.
    @WordPlugin::ISA = $openoffice_ext_working
        ? ('ConvertBinaryFile', 'OpenOfficeConverter')
        : ('ConvertBinaryFile');
}
69
# Arguments WordPlugin always accepts.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Extra argument appended by new() when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra argument appended by new() when the OpenOffice extension is working.
my $opt_openoffice_args = [
    {
        'name' => 'openoffice_scripting',
        'desc' => '{OpenOfficeConverter.openoffice_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Arguments that apply to either kind of office scripting; appended by new()
# when at least one of the two scripting options is on offer.
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlugin.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlugin.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlugin.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlugin.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];

# Plugin description record; 'args' refers to the $arguments list above.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes',    # Source docs in Word can be replaced with GS-generated html
    'args'           => $arguments,
};
[3540]134
# Constructor.
#
# Parameters:
#   $class           - class to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list and option record
#                      are appended to its "ArgList" and "OptList" entries
#
# Returns the new object blessed into $class. When the parent constructor set
# 'info_only', the object is returned straight away without any of the
# conversion or secondary-plugin configuration below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if ($openoffice_ext_installed) {
        print STDERR "WordPlugin: OpenOffice Extension to Greenstone detected\n";
        if ($openoffice_ext_working) {
            print STDERR "... and it appears to be working\n";
        }
        else {
            print STDERR "... but it appears to be broken\n";
            &gsprintf(STDERR, "OpenOfficeConverter: {OpenOfficeConverter.noconversionavailable} ({OpenOfficeConverter.$OpenOfficeConverter::no_openoffice_conversion_reason})\n");
        }
    }

    # $arguments is a file-level list shared by every call to new(), so only
    # append optional arguments that are not already in it; otherwise each
    # additional construction would register the same arguments again.
    my $append_missing_args = sub {
        my ($extra_args) = @_;
        my %already_listed;
        $already_listed{ $_->{'name'} } = 1 for @$arguments;
        push(@$arguments, grep { !$already_listed{ $_->{'name'} } } @$extra_args);
    };

    my $office_capable = 0;
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $append_missing_args->($opt_windows_args);
        $office_capable = 1;
    }
    if ($openoffice_ext_working) {
        $append_missing_args->($opt_openoffice_args);
        $office_capable = 1;
    }
    # these office args apply to windows scripting or to openoffice scripting
    if ($office_capable) {
        $append_missing_args->($opt_office_args);
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = {};
    if ($openoffice_ext_working) {
        # Build one object per parent class and merge them into a single hash.
        my $ooc_self = OpenOfficeConverter->new($pluginlist, $inputargs, $hashArgOptLists);
        my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
        $self = BasePlugin::merge_inheritance($ooc_self, $cbf_self);
    }
    else {
        $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    }

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
        }
        else {
            $self->{'convert_options'} = "-openoffice_scripting";
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # if we use the merge inheritance above, need to bless self before calling a method.
    $self = bless $self, $class;
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        push(@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push(@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push(@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push(@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push(@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push(@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): when the user supplies metadata_fields, -metadata_fields
        # is passed a second time; presumably the later value wins - confirm.
        push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Two-argument bless: a one-argument bless here would re-bless the object
    # into WordPlugin even when new() was invoked on a subclass.
    return bless $self, $class;
}
270
# Initialise the plugin: run the parent class's init with the supplied
# verbosity and handles, then OpenOfficeConverter's init when the OpenOffice
# extension is working.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init($verbosity, $outhandle, $failhandle);
    $self->OpenOfficeConverter::init() if $openoffice_ext_working;
}
280
# Called only once, after all plugin passes have been done. Runs
# OpenOfficeConverter's deinit first (when the extension is working) and then
# the parent class's deinit.
sub deinit {
    my $self = shift (@_);

    $self->OpenOfficeConverter::deinit() if $openoffice_ext_working;
    $self->SUPER::deinit();
}
290
# Returns the default process_exp regular expression (as a string). The
# .docx and .odt extensions are only included when the OpenOffice extension
# is working.
sub get_default_process_exp {
    my $self = shift (@_);

    return $openoffice_ext_working
        ? q^(?i)\.(doc|dot|docx|odt)$^
        : q^(?i)\.(doc|dot)$^;
}
298
[22611]299# if we are using open office, we use OpenOfficeConverter's convert method, otherwise fall back to ConvertBinaryFile method.
# Converts $input_filename to the format named by $output_ext.
#
# When the OpenOffice extension is working and -openoffice_scripting is set,
# OpenOfficeConverter's convert method is used: the new filename is returned
# on a non-zero result, otherwise the error text is printed to the plugin's
# outhandle and "" is returned. In every other case the call is passed on
# unchanged to ConvertBinaryFile's tmp_area_convert_file.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename) = @_;

    unless ($openoffice_ext_working && $self->{'openoffice_scripting'}) {
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    my ($status, $message, $converted_filename)
        = $self->OpenOfficeConverter::convert($input_filename, $output_ext);
    return $converted_filename if $status != 0;

    my $out = $self->{'outhandle'};
    print $out "Open Office Conversion error\n";
    print $out $message;
    return "";
}
# Old post-processing step: determines the language and encoding of the
# converted file and reads it into a local string. The steps that used to
# repair the text and write it back out are disabled, so the file itself is
# left untouched.
sub convert_post_process_old {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    return $self->read_file($conv_filename, $encoding, $language, \$text);

    # Disabled follow-up steps, kept for reference:
    #   turn any high bytes that aren't valid utf-8 into utf-8.
    #     unicode::ensure_utf8(\$text);
    #   Write it out again!
    #     $self->utf8_write_file (\$text, $conv_filename);
}
337
[10279]338# Modified to cache HTML files for efficiency reasons rather
339# than delete all. HTML is modified not to use IE's VML.
340# VML uses WMZ files, so these can be deleted.
# Removes the .wmz files from the directory named by $self->{'files_dir'},
# leaving every other file in place. Does nothing when 'files_dir' is not
# defined or the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    opendir(my $dir_handle, $html_files_dir) or return;

    # A lexical handle is used so no package-global dirhandle is shared with
    # other code; the listing is read in full and the handle closed before
    # anything is deleted.
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = &util::filename_cat($html_files_dir, $wmz_file);
        &util::rm($full_path);
    }

    return;
}
360
[10441]361
[1410]3621;
[10279]363
Note: See TracBrowser for help on using the repository browser.