source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22505

Last change on this file since 22505 was 22505, checked in by kjdon, 14 years ago

added the openoffice_scripting arg here instead of in OpenOfficeConverter

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use ConvertBinaryFile;
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33
34# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile
35
# Arguments this plugin always declares.  new() may append further
# platform-dependent entries to this same list before registering it.
# The 'desc' values look like resource keys rather than literal help
# text -- presumably resolved elsewhere; confirm against the caller.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'description_tags',
        desc => '{HTMLPlugin.description_tags}',
        type => 'flag',
    },
];
46
47
# Extra argument that new() adds to the argument list only when the
# GSDLOS environment variable identifies the platform as Windows.
my $opt_windows_args = [
    {
        name => 'windows_scripting',
        desc => '{WordPlugin.windows_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];
52
# Extra argument that new() adds to the argument list when the platform is
# not Windows and the OpenOfficeConverter module could be loaded.
my $opt_openoffice_args = [
    {
        name => 'openoffice_scripting',
        desc => '{WordPlugin.openoffice_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];
58
# Arguments that new() adds alongside either of the scripting flags.
# new() forwards their values to the secondary StructuredHTMLPlugin when
# office scripting is switched on.
my $opt_office_args = [
    {
        name => 'metadata_fields',
        desc => '{WordPlugin.metadata_fields}',
        type => 'string',
        deft => 'Title',
    },
    {
        name => 'level1_header',
        desc => '{StructuredHTMLPlugin.level1_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level2_header',
        desc => '{StructuredHTMLPlugin.level2_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level3_header',
        desc => '{StructuredHTMLPlugin.level3_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'title_header',
        desc => '{StructuredHTMLPlugin.title_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'delete_toc',
        desc => '{StructuredHTMLPlugin.delete_toc}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'toc_header',
        desc => '{StructuredHTMLPlugin.toc_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
];
92
93
# Option block that new() registers in the caller's "OptList".
# 'args' holds a reference to $arguments itself, so entries appended to
# that list later are visible through this block as well.
my $options = {
    name           => 'WordPlugin',
    desc           => '{WordPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
100
# Constructor.
#
# Arguments:
#   $class           - class name the new plugin object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed unchanged to the parent class constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its option block to "OptList"
#
# The parent class is chosen at run time: OpenOfficeConverter when that
# module can be loaded, otherwise ConvertBinaryFile.
#
# Returns the plugin object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Useful debugging aid if there is a syntax error in OpenOfficeConverter:
    # print STDERR "$@\n" when the require below fails.
    my $openoffice_ext_installed = eval { require OpenOfficeConverter; 1 } ? 1 : 0;

    if ($openoffice_ext_installed) {
        # Successfully found
        print STDERR "WordPlugin: OpenOffice Extension to Greenstone detected\n";
    }

    # @ISA is package-global, so only add the parent the first time round;
    # otherwise every constructor call would append another duplicate.
    my $parent_class = $openoffice_ext_installed ? "OpenOfficeConverter" : "ConvertBinaryFile";
    if (!grep { $_ eq $parent_class } @WordPlugin::ISA) {
        push(@WordPlugin::ISA, $parent_class);
    }

    # Work out which optional arguments apply on this platform/installation.
    my @optional_args = ();
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        @optional_args = (@$opt_windows_args, @$opt_office_args);
    }
    elsif ($openoffice_ext_installed) {
        @optional_args = (@$opt_openoffice_args, @$opt_office_args);
    }

    # $arguments is shared file-level state (and is what $options->{'args'}
    # points at), so it must still be extended in place -- but each optional
    # argument is added only once, however many times new() is called.
    my %already_listed = map { $_->{'name'} => 1 } @$arguments;
    push(@$arguments, grep { !$already_listed{$_->{'name'}} } @optional_args);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self;
    my $outhandle;

    if ($openoffice_ext_installed) {
        $self = OpenOfficeConverter->new($pluginlist, $inputargs, $hashArgOptLists);
        $outhandle = $self->{'outhandle'};

        if ($self->{'openoffice_conversion_available'}) {
            print {$outhandle} "WordPlugin: OpenOffice scripting functionality available\n";

            # Override default process expression
            $self->{'process_exp'} = q^(?i)\.(doc|dot|docx|odt)$^;

            if ($self->{'openoffice_scripting'}) {
                print {$outhandle} "WordPlugin: Activating OpenOffice scripting functionality\n";
            }
        }
        else {
            print {$outhandle} "WordPlugin: Unable to run 'soffice'\n";
            print {$outhandle} "WordPlugin: Defaulting to ConvertBinaryFile inheritence\n";
        }
    }
    else {
        $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
        $outhandle = $self->{'outhandle'};
    }

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            print {$outhandle} "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print {$outhandle} "         on at the same time.  Defaulting to -windows_scripting\n";
        }
        else {
            $self->{'convert_options'} = "-openoffice_scripting";
            $self->{'office_scripting'} = 1;
        }
    }

    # we always save as utf-8
#    if ($self->{'input_encoding'} eq "auto") {
#        $self->{'input_encoding'} = "utf8";
#    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # StructuredHTMLPlugin options are only set up when one of the scripting
    # modes is active, and only if nothing has configured them already.
    if (defined $self->{'office_scripting'}
        && !defined $secondary_plugin_options->{'StructuredHTMLPlugin'}) {
        my $new_structhtml_options = [];
        $secondary_plugin_options->{'StructuredHTMLPlugin'} = $new_structhtml_options;

        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$new_structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$new_structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
        push (@$new_structhtml_options, "-description_tags") if $self->{'office_scripting'};
        push (@$new_structhtml_options, "-extract_language") if $self->{'extract_language'};
        push (@$new_structhtml_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$new_structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$new_structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$new_structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$new_structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$new_structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        push (@$new_structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$new_structhtml_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: it only exists when scripting is on or it was preconfigured.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");
    # Guarded: pushing through an undef lexical would autovivify a throwaway
    # array and make the "if defined" checks further down always true.
    push(@$structhtml_options, "-processing_tmp_files") if defined $structhtml_options;

    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this
    push(@$html_options,"-input_encoding", "utf8");
    push(@$html_options,"-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    }

    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # Already blessed into $class above.  A one-argument bless here would
    # re-bless the object into WordPlugin and break any subclass.
    return $self;
}
251
# Initialisation hook: passes the verbosity level and the output and
# failure handles straight on to the parent class's init().
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    return $self->SUPER::init($verbosity, $outhandle, $failhandle);
}
258
# Called only once, after all plugin passes have been done.
# Simply delegates to the parent class's deinit().
sub deinit {
    my $self = shift(@_);

    return $self->SUPER::deinit();
}
265
# Returns the default regular expression (as a string) used to decide which
# filenames this plugin processes: a case-insensitive match on names ending
# in ".doc" or ".dot".  Any arguments, including an invocant, are ignored.
sub get_default_process_exp {
    my $default_exp = '(?i)\.(doc|dot)$';

    return $default_exp;
}
271
# Legacy post-conversion hook (note the "_old" suffix).
#
# Arguments:
#   $conv_filename - path of the converted file
#
# Detects the converted file's language and encoding with
# textcat_get_language_encoding() and reads it through read_file().
# The steps that once repaired invalid UTF-8 and wrote the file back are
# disabled (kept below as comments), so the text that is read is discarded
# and the file itself is left untouched.
#
# Returns whatever read_file() returns.
sub convert_post_process_old
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    my $read_result = $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Disabled: turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Disabled: write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    return $read_result;
}
291
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# Only the .wmz files in the files directory are removed here
# (NOTE(review): presumably the metafiles VML relies on -- confirm).
#
# Reads the directory path from $self->{'files_dir'}; does nothing when
# that entry is undefined or the directory cannot be opened.  The return
# value is not meaningful.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical handle is used so no global bareword dirhandle is clobbered.
    opendir(my $dir_handle, $html_files_dir) or return;

    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    foreach my $wmz_file (@wmz_files) {
        my $full_path = util::filename_cat($html_files_dir, $wmz_file);
        util::rm($full_path);
    }

    closedir($dir_handle);
}
314
315
3161;
317
Note: See TracBrowser for help on using the repository browser.