source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22861

Last change on this file since 22861 was 22861, checked in by kjdon, 14 years ago

Now uses the new AutoLoadConverters instead of AutoloadConverterScripting. The new module doesn't inherit from ConvertBinaryFile, so these plugins all inherit from ConvertBinaryFile again. We can now initialise the converters and fix up the modifications to the arguments before they are parsed when we do new ConvertBinaryFile. PowerPointPlugin is incomplete and still needs a lot of work on processing the result of the OpenOffice conversion.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
# Set up inheritance at compile time: ConvertBinaryFile is searched first,
# then AutoLoadConverters.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Set to 1 by new() once AutoLoadConverters reports that OpenOffice is
# available; read by get_default_process_exp().
my $openoffice_available = 0;

# Arguments that are always offered.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        # delayed: new() replaces this string with the real default
        'deft' => "&get_default_process_exp()",
        'reqd' => "no",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
];

# Extra argument added by new() when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => "windows_scripting",
        'desc' => "{WordPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Extra arguments added by new() when either windows scripting or
# OpenOffice conversion is possible.
my $opt_office_args = [
    {
        'name' => "metadata_fields",
        'desc' => "{WordPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title",
    },
    {
        'name' => "level1_header",
        'desc' => "{StructuredHTMLPlugin.level1_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level2_header",
        'desc' => "{StructuredHTMLPlugin.level2_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level3_header",
        'desc' => "{StructuredHTMLPlugin.level3_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "title_header",
        'desc' => "{StructuredHTMLPlugin.title_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "delete_toc",
        'desc' => "{StructuredHTMLPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "toc_header",
        'desc' => "{StructuredHTMLPlugin.toc_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
];

# Plugin description record; new() pushes it onto the caller's OptList.
my $options = {
    'name'           => "WordPlugin",
    'desc'           => "{WordPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
101
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the AutoLoadConverters
#                      and ConvertBinaryFile constructors
#   $hashArgOptLists - hash ref; our options are pushed onto its "OptList"
#                      array and our arguments onto its "ArgList" array
#
# Returns the new object blessed into $class.  When 'info_only' is set on
# the merged object it is returned straight away, without any conversion
# settings or secondary plugins being configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Our arguments are only added to ArgList further down, after the
    # optional ones have been appended, because ArgList is what the input
    # args are parsed against.
    # The options need to go in first, to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # $arguments is shared by every object created in this process, so only
    # append optional arguments that are not already in it; otherwise a
    # second call to new() would add them again.
    my $add_args_once = sub {
        my ($extra_args) = @_;
        my %present = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$present{$_->{'name'}} } @$extra_args);
    };

    my $office_capable = 0;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $add_args_once->($opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        $add_args_once->($opt_office_args);
    }

    # Work out the default for process_exp.  It has to be delayed till here
    # so we know if openoffice is available or not, but it needs to be done
    # before the args are parsed.  The function is called directly rather
    # than string-eval'ing the placeholder, so that constructing a second
    # object does not try to eval the already-computed regular expression.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = get_default_process_exp();
            last;
        }
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # a user-supplied metadata_fields is passed on in addition to the
        # fixed list pushed above
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}
235
# Return the default process_exp (a case-insensitive file extension
# pattern).  The docx and odt extensions are only included when OpenOffice
# has been found to be available.
sub get_default_process_exp {
    my ($self) = @_;

    return $openoffice_available
        ? q^(?i)\.(doc|dot|docx|odt)$^
        : q^(?i)\.(doc|dot)$^;
}
244
# Run the inherited init (ConvertBinaryFile comes first in @ISA) with the
# caller's arguments, then the AutoLoadConverters init with none.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->AutoLoadConverters::init();
}
253
# Run the AutoLoadConverters begin (with no arguments) first, then the
# inherited begin with the caller's arguments.
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin();
    return $self->SUPER::begin(@begin_args);
}
261
# Run the AutoLoadConverters deinit (with no arguments) first, then the
# inherited deinit with the caller's arguments.
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit();
    return $self->SUPER::deinit(@deinit_args);
}
269
# Always use the AutoLoadConverters implementation of
# tmp_area_convert_file, passing all arguments through and returning
# whatever it returns.
sub tmp_area_convert_file {
    my ($self, @convert_args) = @_;

    return $self->AutoLoadConverters::tmp_area_convert_file(@convert_args);
}
276
277
# Old post-processing step for a converted file: detect its language and
# encoding, then read it in.  The steps that would fix up the text and write
# it back out are commented out, so the text that is read is not used.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);
}
297
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every file whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}.  Does nothing when 'files_dir' is not defined or
# the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => a failed opendir just means there is nothing to do.
    # A lexical handle is used so no global bareword dirhandle is shared
    # with other code.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_wmz_file = &util::filename_cat($html_files_dir, $wmz_file);
        &util::rm($full_wmz_file);
    }

    return;
}
320
321
3221;
323
Note: See TracBrowser for help on using the repository browser.