source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 23484

Last change on this file since 23484 was 22894, checked in by kjdon, 14 years ago

added wpd (word perfect) extension into the list that can be processed by open office

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
# Establish the inheritance chain at compile time: ConvertBinaryFile is
# searched first, then AutoLoadConverters.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Set to 1 by new() once the OpenOffice converter is found to be available;
# read by get_default_process_exp() to choose the default file extensions.
my $openoffice_available = 0;

# Arguments that are always offered by this plugin.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => '&get_default_process_exp()',    # delayed (resolved in new())
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Extra argument that new() adds when GSDLOS reports windows.
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra arguments that new() adds when either windows scripting or
# OpenOffice conversion is possible.
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlugin.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlugin.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlugin.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlugin.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];

# Plugin description record; 'args' shares the $arguments array reference,
# so arguments added to it later are visible here as well.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes',    # Source docs in Word can be replaced with GS-generated html
    'args'           => $arguments,
};
101
# Append to @$target each argument spec from @$extra whose 'name' is not
# already present in @$target.  This keeps the shared, module-level argument
# list free of duplicates when the constructor runs more than once.
sub _add_args_once {
    my ($target, $extra) = @_;

    my %seen = map { $_->{'name'} => 1 } @$target;
    foreach my $arg (@$extra) {
        next if $seen{$arg->{'name'}}++;
        push(@$target, $arg);
    }
}

# Constructor.
#
# Parameters:
#   $class           - class name to bless into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's option record is pushed onto
#                      its "OptList" and its arguments onto its "ArgList"
#
# Returns the blessed plugin object.  When 'info_only' is set on the merged
# object it is returned straight after blessing, without any conversion
# settings or secondary plugins being configured.
#
# Side effects: may extend the module-level $arguments list (windows/office
# arguments), resolves the delayed process_exp default, and sets the
# module-level $openoffice_available flag.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push needs to happen later, after the arguments array has
    # been finished - it is used for parsing the input args.
    # The OptList entry needs to go in first, to get the print info in the
    # right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $office_capable = 0;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        # $arguments is shared by every instance, so only add what is missing
        _add_args_once($arguments, $opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        _add_args_once($arguments, $opt_office_args);
    }

    # Work out the default for process_exp - it needs to be delayed till here
    # so we know if openoffice is available or not. But it needs to be done
    # before parsing the args.  The default is computed with a direct call
    # rather than by string-evaluating the stored placeholder: after the first
    # construction the stored value is already a regular expression, and
    # evaluating that as Perl code would fail and leave the default undefined.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = get_default_process_exp();
            last;
        }
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): -metadata_fields may now appear twice in the list;
        # presumably the later, user-supplied value wins - confirm against
        # the secondary plugin's argument parsing.
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}
234
# Return the default process_exp pattern (as a string): the wider extension
# list when the OpenOffice converter has been detected, otherwise only the
# .doc/.dot extensions.  Any arguments passed in are ignored.
sub get_default_process_exp {
    return $openoffice_available
        ? q^(?i)\.(doc|dot|docx|odt|wpd)$^
        : q^(?i)\.(doc|dot)$^;
}
243
# Run the parent class's init (ConvertBinaryFile, per the original comment)
# with the caller's arguments, then AutoLoadConverters' init with none.
sub init {
    my $self = shift;

    # ConvertBinaryFile init
    $self->SUPER::init(@_);

    $self->AutoLoadConverters::init();
}
252
# Run AutoLoadConverters' begin (no arguments) first, then the parent
# class's begin with the caller's arguments.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin();

    $self->SUPER::begin(@_);
}
260
# Run AutoLoadConverters' deinit (no arguments) first, then the parent
# class's deinit with the caller's arguments.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit();

    $self->SUPER::deinit(@_);
}
268
# Delegate conversion explicitly to AutoLoadConverters' implementation,
# passing the caller's arguments through and returning its result.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
275
276
# Old post-processing step.  Asks textcat_get_language_encoding for the
# language and encoding of the converted file, then reads the file into a
# local buffer via read_file.  The steps that used to repair and rewrite the
# text are commented out, so the buffer is discarded.
#
# Parameters:
#   $conv_filename - path of the converted file
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($lang, $enc) = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($contents will be in utf8)
    my $contents = "";
    $self->read_file($conv_filename, $enc, $lang, \$contents);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$contents);

    # Write it out again!
    #$self->utf8_write_file (\$contents, $conv_filename);
}
296
# Modified to cache HTML files for efficiency reasons rather
# than delete them all. HTML is modified not to use IE's VML.
# VML uses .wmz files, so these can be deleted.
# Remove the .wmz files from the directory named by $self->{'files_dir'},
# leaving every other file there in place.
#
# Does nothing when 'files_dir' is not defined or the directory cannot be
# opened.  The return value is not meaningful.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => opendir fails and there is nothing to do.
    # A lexical dirhandle is used so that no global handle name is shared
    # with other code.
    opendir(my $dirhandle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dirhandle);
    closedir($dirhandle);

    foreach my $f (@wmz_files) {
        my $full_f = &util::filename_cat($html_files_dir, $f);
        &util::rm($full_f);
    }

    return;
}
319
320
3211;
322
Note: See TracBrowser for help on using the repository browser.