source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 36655

Last change on this file since 36655 was 34997, checked in by davidb, 3 years ago

When working with orthogonal indexes, these plugins' constructors get called a second time; however, the way the eval expression was written resulted in an error. The change is to test whether the eval_expr is still in the form of '&...' and, if it is, let the eval go ahead. Otherwise (i.e. second time in the constructor) it has already been evaluated and stored under the 'deft' name, in which case no further work needs to be done

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
# Set up inheritance at compile time: WordPlugin derives from
# ConvertBinaryFile first, then AutoLoadConverters.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Set to 1 by new() once the AutoLoadConverters object reports that
# OpenOffice is available; read by get_default_process_exp().
my $openoffice_available = 0;

# Arguments that are always offered by this plugin.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        # Delayed default: new() evaluates this string once it knows
        # whether OpenOffice is available.
        'deft' => '&get_default_process_exp()',
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Extra argument that new() adds when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Extra arguments that new() adds when either windows scripting or
# OpenOffice conversion is possible.
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlugin.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlugin.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlugin.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlugin.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];

# Plugin description record; new() pushes it onto the caller's OptList.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
101
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's option record is pushed onto
#                      its "OptList" and its argument specs onto "ArgList"
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, it is returned immediately without any conversion set-up.
#
# The file-level $arguments array is shared between calls and the
# constructor can be invoked more than once, so the optional argument
# specs are only appended when a spec of that name is not already present.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array is complete
    # (it is used for parsing the input args), but the option record must
    # go in first to get the print info in the right order.
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Names already registered in the shared $arguments array; used to
    # avoid appending the optional specs again on a repeat construction.
    my %have_arg = map { $_->{'name'} => 1 } @$arguments;

    my $office_capable = 0;
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        push(@$arguments, grep { !$have_arg{$_->{'name'}}++ } @$opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["OpenOfficeConverter"], 1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        push(@$arguments, grep { !$have_arg{$_->{'name'}}++ } @$opt_office_args);
    }

    # Evaluate the default for process_exp. This is delayed until here so
    # that we know whether openoffice is available, but it must happen
    # before the args are parsed. On a repeat construction the default has
    # already been replaced by the evaluated value (it no longer starts
    # with '&'), so it is left alone.
    foreach my $arg_spec (@$arguments) {
        next unless $arg_spec->{'name'} eq "process_exp";
        my $eval_expr = $arg_spec->{'deft'};
        if ($eval_expr =~ m/^&/) {
            $arg_spec->{'deft'} = eval "$eval_expr";
        }
        last;
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        push(@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-delete_toc") if $self->{'delete_toc'};

        # Options that are forwarded together with their value, but only
        # when a true value has been set. The order here is the order in
        # which they are handed to the secondary plugin.
        foreach my $opt_name (qw(toc_header title_header
                                 level1_header level2_header level3_header
                                 metadata_fields metadata_field_separator)) {
            push(@$specific_options, "-$opt_name", $self->{$opt_name}) if $self->{$opt_name};
        }
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
236
# Returns the default process_exp regular expression (as a string) for
# the current set-up. new() invokes this through the delayed 'deft' value
# of the process_exp argument.
sub get_default_process_exp {
    # OpenOffice available: widest set of extensions.
    return '(?i)\.(doc|dot|docx|odt|wpd)$' if $openoffice_available;

    # if OS is windows, can try using docx2html vbs script to see if they
    # have Word 2007 if the user turns windows_scripting on
    return '(?i)\.(docx?|dot)$' if $ENV{'GSDLOS'} =~ m/^windows$/i;

    return '(?i)\.(doc|dot)$';
}
249
# Runs the init step of both parents, forwarding the caller's arguments:
# first the SUPER (ConvertBinaryFile) init, then the AutoLoadConverters one.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
258
# Runs the begin step of both parents, forwarding the caller's arguments:
# AutoLoadConverters first, then SUPER.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
266
# Runs the deinit step of both parents, forwarding the caller's arguments:
# AutoLoadConverters first, then SUPER.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
274
# Delegates the conversion explicitly to the AutoLoadConverters
# implementation, passing the caller's arguments straight through.
sub tmp_area_convert_file {
    my $self = shift;

    my @convert_result = $self->AutoLoadConverters::tmp_area_convert_file(@_)
        if 0;    # never executed; the call below keeps the caller's context

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
281
282
# Old post-processing step for a converted file.
#
# Detects the language and encoding of $conv_filename and reads the file
# into a local buffer. The steps that would fix up and rewrite the text
# are commented out, so nothing is written back.
sub convert_post_process_old {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    $self->read_file($conv_filename, $encoding, $language, \$text);
}
302
303# Modified to cache HTML files for efficiency reasons rather
304# than delete all. HTML is modified not to use IE's VML.
305# VML uses WMZ files, so these can be deleted.
# Removes the .wmz files from the directory named by $self->{'files_dir'},
# leaving every other file in that directory in place.
#
# Does nothing when 'files_dir' is not defined or the directory cannot be
# opened (if the HTML file has no supporting images, then no _files dir
# is made).
#
# A lexical directory handle is used rather than a bareword one, so this
# method cannot interfere with any other code that uses the same global
# handle name.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = &FileUtils::filenameConcatenate($html_files_dir, $wmz_file);
        &FileUtils::removeFiles($full_path);
    }

    return;
}
325
326
3271;
328
Note: See TracBrowser for help on using the repository browser.