source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
# Set up the inheritance chain at compile time: WordPlugin derives from
# ConvertBinaryFile first and AutoLoadConverters second.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Set to 1 by new() when the AutoLoadConverters object reports that the
# OpenOffice converter is available; read by get_default_process_exp().
my $openoffice_available = 0;

# Arguments that every WordPlugin accepts. new() may append further
# platform-dependent arguments to this list before it is registered.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        # Placeholder only: new() evals this string once it knows whether
        # OpenOffice is available, and stores the result as the default.
        'deft' => '&get_default_process_exp()',
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
52
53
# Extra argument that new() adds to the argument list only when the
# GSDLOS environment variable says we are running on Windows.
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];
59
# Extra arguments that new() adds when either Windows scripting or
# OpenOffice conversion is possible. The four "*_header" entries that
# share one shape are generated from their names; list order is
# significant because it is the order in which the arguments are listed.
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    (
        map {
            +{
                'name' => $_,
                'desc' => '{StructuredHTMLPlugin.' . $_ . '}',
                'type' => 'regexp',
                'reqd' => 'no',
                'deft' => '',
            }
        } qw(level1_header level2_header level3_header title_header)
    ),
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];
93
94
# Plugin description record that new() pushes onto the caller's OptList.
# 'args' points at the shared $arguments list, so arguments appended to
# that list later are visible through this record as well.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
101
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays receive
#                      this plugin's option record and argument specs
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, the object is returned straight after blessing, without configuring
# conversion settings or loading secondary plugins.
#
# The file-level $arguments list is shared by every call, so this routine
# is written to be safe to call more than once: optional argument specs are
# appended only if not already present, and the process_exp default is only
# evaluated while it is still the unevaluated "&..." placeholder.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The argument list is pushed onto ArgList further down, once it has
    # been finished - it is used for parsing the input args.
    # The option record needs to go in first, to get the print info in
    # the right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Append argument specs to the shared $arguments list, skipping any
    # whose name is already listed (e.g. added by an earlier call to new).
    my $append_missing_args = sub {
        my ($extra_args) = @_;
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{$_->{'name'}} } @$extra_args);
    };

    my $office_capable = 0;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $append_missing_args->($opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        $append_missing_args->($opt_office_args);
    }

    # Evaluate the default for process_exp. It has to be delayed until here
    # so we know whether openoffice is available, but must be done before
    # the args are parsed. Once evaluated, the stored default is a regular
    # expression string rather than Perl code, so it must not be evaluated
    # again: only the "&..." placeholder form is passed to eval.
    foreach my $arg_spec (@$arguments) {
        next unless $arg_spec->{'name'} eq "process_exp";
        my $default = $arg_spec->{'deft'};
        if (defined $default && $default =~ m/^&/) {
            $arg_spec->{'deft'} = eval "$default";
        }
        last;
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            # The two conversion routes are mutually exclusive;
            # windows_scripting wins.
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): -metadata_fields is passed a second time here when the
        # user supplied a value; presumably the later occurrence takes
        # precedence in the secondary plugin - confirm.
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}
234
# Return the default process_exp (a case-insensitive filename regular
# expression, as a string) for the current environment:
#   - OpenOffice available:  doc, dot, docx, odt and wpd files
#   - otherwise on Windows:  doc, docx and dot files (docx can be tried via
#     the docx2html vbs script if the user turns windows_scripting on and
#     has Word 2007)
#   - anything else:         doc and dot files only
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.(doc|dot|docx|odt|wpd)$^ if $openoffice_available;

    return q^(?i)\.(docx?|dot)$^ if $ENV{'GSDLOS'} =~ m/^windows$/i;

    return q^(?i)\.(doc|dot)$^;
}
247
# Initialise both parents: the SUPER class first (ConvertBinaryFile init),
# then AutoLoadConverters. Remaining arguments are forwarded to each, and
# the result of the AutoLoadConverters call is returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
256
# Run the begin step of both parents: AutoLoadConverters first, then the
# SUPER class. Remaining arguments are forwarded to each, and the result
# of the SUPER call is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
264
# Run the deinit step of both parents: AutoLoadConverters first, then the
# SUPER class. Remaining arguments are forwarded to each, and the result
# of the SUPER call is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
272
# Always use the AutoLoadConverters implementation of
# tmp_area_convert_file, naming it explicitly rather than relying on
# method resolution through @ISA. Arguments and return value pass
# straight through.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
279
280
# Old post-processing step for a converted file.
#
# Parameters:
#   $conv_filename - path of the converted file
#
# Detects the file's language and encoding via
# textcat_get_language_encoding and reads the file into memory with
# read_file. The steps that used to repair invalid UTF-8 and write the
# text back out are disabled, so the file itself is left unmodified.
# Returns whatever read_file returns.
sub convert_post_process_old
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    return $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Disabled follow-up steps, kept for reference:
    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);
}
300
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses .wmz files, so these can be deleted.
#
# Removes every file whose name ends in ".wmz" from the directory named
# by $self->{'files_dir'}. Does nothing if 'files_dir' is not defined or
# the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    opendir(my $dir_handle, $html_files_dir) or return;

    # Collect the names first and close the directory before deleting,
    # so no directory handle is held open while files are removed.
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $f (@wmz_files) {
        my $full_f = &FileUtils::filenameConcatenate($html_files_dir,$f);
        &FileUtils::removeFiles($full_f);
    }

    return;
}
323
324
3251;
326
Note: See TracBrowser for help on using the repository browser.