source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22709

Last change on this file since 22709 was 22709, checked in by davidb, 14 years ago

Fixed up -process_exp so it now dynamically configures itself properly, based on whether OpenOfficeConverter is found or not

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoloadConverterScripting;
34
35@WordPlugin::ISA = ('AutoloadConverterScripting');
36
37
# Arguments that WordPlugin always accepts, whatever the platform.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        # Placeholder only: new() swaps it for the real default once
        # autoloading has taken place (delayed evaluation).
        deft => '&get_default_process_exp($self)',
        reqd => 'no',
    },
    {
        name => 'description_tags',
        desc => '{HTMLPlugin.description_tags}',
        type => 'flag',
    },
];
48
49
# Extra argument that new() only offers when GSDLOS reports Windows.
my $opt_windows_args = [
    {
        name => 'windows_scripting',
        desc => '{WordPlugin.windows_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];
54
# Extra arguments that new() offers whenever office scripting is possible
# (Windows scripting or a working OpenOffice extension).
my $opt_office_args = [
    {
        name => 'metadata_fields',
        desc => '{WordPlugin.metadata_fields}',
        type => 'string',
        deft => 'Title',
    },
    {
        name => 'level1_header',
        desc => '{StructuredHTMLPlugin.level1_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level2_header',
        desc => '{StructuredHTMLPlugin.level2_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level3_header',
        desc => '{StructuredHTMLPlugin.level3_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'title_header',
        desc => '{StructuredHTMLPlugin.title_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'delete_toc',
        desc => '{StructuredHTMLPlugin.delete_toc}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'toc_header',
        desc => '{StructuredHTMLPlugin.toc_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
];
88
89
# Option record that new() appends to the shared "OptList".
my $options = {
    name           => 'WordPlugin',
    desc           => '{WordPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
96
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" array refs
#                      receive this plugin's argument specs and option record
#
# Returns the plugin object blessed into $class.  When 'info_only' is set on
# the object returned by the parent constructor, only the argument
# descriptions are finalised and no conversion settings are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # GSDLOS may be unset; treat that the same as "not windows"
    my $gsdl_os = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";

    # $arguments is shared by every instance of this plugin, so the optional
    # specs are only added if they are not already there (a second call to
    # new() must not duplicate them)
    my $office_capable = 0;
    if ($gsdl_os =~ m/^windows$/i) {
        _add_args_once($arguments, $opt_windows_args);
        $office_capable = 1;
    }
    if ($AutoloadConverterScripting::openoffice_ext_working) {
        $office_capable = 1;
    }
    # these office args apply to windows scripting or to openoffice scripting
    if ($office_capable) {
        _add_args_once($arguments, $opt_office_args);
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self
        = AutoloadConverterScripting->new("OpenOfficeConverter",$pluginlist,
                                          $inputargs, $hashArgOptLists);

    # plugin's process_exp can only be correctly determined once autoloading
    # has taken place
    my $plug_options = $self->{'option_list'}->[0];
    my $plug_args = $plug_options->{'args'};

    foreach my $arg_spec (@$plug_args) {
        # consider changing this to search for all values that are
        # tagged as 'deft-delayed' = 1 ?!?

        next unless ($arg_spec->{'name'} eq "process_exp");

        # Only the delayed placeholder needs resolving.  The spec is shared
        # between instances, so on a later call 'deft' already holds the
        # resolved regular expression and must be left alone (evaluating a
        # regular expression as Perl code would wipe it out).
        my $delayed_default = $arg_spec->{'deft'};
        next unless (defined $delayed_default
                     && $delayed_default =~ m/^&get_default_process_exp\(/);

        # call the function directly rather than string-eval'ing the placeholder
        my $resolved_default = get_default_process_exp($self);
        $arg_spec->{'deft'} = $resolved_default;

        # Now see if process_exp needs updating
        #
        # process_exp is only defined if not 'info_only'
        #
        # if it does exist and it equals the unresolved placeholder
        # then it was set to the default (rather than overridden by
        # the collect.cfg file)
        my $process_exp = $self->{'process_exp'};
        if (!$self->{'info_only'} && defined $process_exp
            && ($process_exp eq $delayed_default)) {
            $self->{'process_exp'} = $resolved_default;
        }
    }

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
            $self->{'openoffice_scripting'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return bless $self, $class;
}

# Private helper for new(): appends each argument spec in $extra_args to
# $target_args unless a spec with the same 'name' is already present.
# Relative order of the appended specs is preserved.
sub _add_args_once {
    my ($target_args, $extra_args) = @_;

    my %already_present = map { $_->{'name'} => 1 } @$target_args;
    foreach my $extra_spec (@$extra_args) {
        next if $already_present{$extra_spec->{'name'}}++;
        push(@$target_args, $extra_spec);
    }
}
238
# Returns the default process_exp regular expression (as a string) for this
# plugin.  When the object's 'scripting_ext_working' entry is true the
# pattern also accepts .docx and .odt; otherwise only .doc and .dot.
sub get_default_process_exp {
    my ($self) = @_;

    return $self->{'scripting_ext_working'}
        ? '(?i)\.(doc|dot|docx|odt)$'
        : '(?i)\.(doc|dot)$';
}
246
# Old post-processing step for a converted file.  It asks the object for the
# file's language and encoding and then reads the file into a local buffer;
# the steps that would fix up and rewrite the file are commented out, so the
# file itself is not modified here.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    return $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);
}
266
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# The .wmz files are only needed for VML, so these can be deleted.
#
# Removes every entry ending in ".wmz" from the directory named by
# $self->{'files_dir'}; all other entries are left in place.  Does nothing
# when 'files_dir' is not defined or the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless (defined $html_files_dir);

    # a lexical directory handle (rather than the bareword DIN) cannot clash
    # with a handle of the same name opened elsewhere in the process
    if (opendir(my $dir_handle, $html_files_dir)) {
        my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
        closedir($dir_handle);

        foreach my $wmz_file (@wmz_files) {
            my $full_wmz_file = &util::filename_cat($html_files_dir,$wmz_file);
            &util::rm($full_wmz_file);
        }
    }
    else {
        # if HTML file has no supporting images, then no _files dir made
        # => do nothing
    }

    return;
}
289
290
2911;
292
Note: See TracBrowser for help on using the repository browser.