Context Navigation

source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 23484

Last change on this file since 23484 was 22894, checked in by kjdon, 14 years ago
added wpd (word perfect) extension into the list that can be processed by open office
Property svn:keywords set to `Author Date Id Revision`
File size: 10.3 KB

Line
1	###########################################################################
2	#
3	# WordPlugin.pm -- plugin for importing Microsoft Word documents
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package WordPlugin;
26
27	use strict;
28	no strict 'refs'; # allow filehandles to be variables and viceversa
29	no strict 'subs';
30
31	use gsprintf 'gsprintf';
32
33	use AutoLoadConverters;
34	use ConvertBinaryFile;
35
# Set up inheritance at compile time.  Method lookup walks @ISA left to
# right, so ConvertBinaryFile is consulted before AutoLoadConverters.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Flag read by get_default_process_exp() to decide which file extensions
# the plugin claims by default; new() sets it to 1 when its
# AutoLoadConverters object reports that OpenOffice is available.
my $openoffice_available = 0;

# Arguments that are always offered by this plugin.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => '&get_default_process_exp()',    # delayed: resolved in new()
        reqd => 'no',
    },
    {
        name => 'description_tags',
        desc => '{HTMLPlugin.description_tags}',
        type => 'flag',
    },
];


# Extra argument offered only when running on Windows (added in new()).
my $opt_windows_args = [
    {
        name => 'windows_scripting',
        desc => '{WordPlugin.windows_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];

# Extra arguments offered when either Windows scripting or OpenOffice
# conversion is possible (added in new()).
my $opt_office_args = do {

    # Build one optional regexp argument whose description key lives in
    # the StructuredHTMLPlugin resource namespace.
    my $regexp_arg = sub {
        my ($arg_name) = @_;
        return {
            name => $arg_name,
            desc => '{StructuredHTMLPlugin.' . $arg_name . '}',
            type => 'regexp',
            reqd => 'no',
            deft => '',
        };
    };

    [
        {
            name => 'metadata_fields',
            desc => '{WordPlugin.metadata_fields}',
            type => 'string',
            deft => 'Title',
        },
        $regexp_arg->('level1_header'),
        $regexp_arg->('level2_header'),
        $regexp_arg->('level3_header'),
        $regexp_arg->('title_header'),
        {
            name => 'delete_toc',
            desc => '{StructuredHTMLPlugin.delete_toc}',
            type => 'flag',
            reqd => 'no',
        },
        $regexp_arg->('toc_header'),
    ];
};


# Plugin description record.  'args' shares the $arguments array, so
# arguments appended to it later are visible through this record too.
my $options = {
    name           => 'WordPlugin',
    desc           => '{WordPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in Word can be replaced with GS-generated html
    args           => $arguments,
};
101
# Append to @$arg_list every argument spec from @$extra_args whose 'name'
# is not already present.  Keeps the shared, module-level argument list
# free of duplicates when the plugin is constructed more than once.
sub _add_missing_args {
    my ($arg_list, $extra_args) = @_;

    my %have = map { $_->{'name'} => 1 } @$arg_list;
    foreach my $arg (@$extra_args) {
        next if $have{$arg->{'name'}}++;
        push(@$arg_list, $arg);
    }
    return;
}

# Constructor.
#
# Parameters:
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to the parent constructors
#   $hashArgOptLists  - hash ref whose "ArgList" and "OptList" arrays
#                       receive this plugin's arguments and option record
#
# Returns the new object blessed into $class.  When the merged object has
# 'info_only' set, it is returned immediately without configuring any
# conversion settings or secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push has to wait until the arguments array has been
    # finished (see below), but the option record needs to go in first, to
    # get the print info in the right order.  $options->{'args'} shares the
    # $arguments array, so later additions still show up through it.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $office_capable = 0;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        # $arguments is module-level state: only add what is not there yet,
        # so constructing a second instance does not duplicate entries.
        _add_missing_args($arguments, $opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        _add_missing_args($arguments, $opt_office_args);
    }

    # Work out the default for process_exp - it needs to be delayed till
    # here so we know if openoffice is available or not, but it must be
    # done before parsing the args.  The function is called directly rather
    # than string-evaluating the stored value: after the first construction
    # that value is already a regex, and evaluating it as Perl code would
    # fail and leave the default undefined.
    foreach my $arg (@$arguments) {
        if ($arg->{'name'} eq "process_exp") {
            $arg->{'deft'} = get_default_process_exp();
            last;
        }
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            # the two conversion routes are mutually exclusive
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        push(@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push(@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push(@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push(@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push(@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push(@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): -metadata_fields is passed a second time here with the
        # user's value; presumably the later occurrence wins - confirm
        # against the argument parser.
        push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}
234
# Return the default process_exp: a case-insensitive regular expression
# (as a string) matching the file extensions this plugin handles.
# With OpenOffice available the list also covers docx, odt and wpd;
# otherwise only doc and dot are claimed.
sub get_default_process_exp {
    # The alternatives are separated by a plain "|".  An escaped "\|" would
    # match a literal pipe character, so ordinary names such as "a.doc"
    # would never be accepted.
    if ($openoffice_available) {
        return q^(?i)\.(doc|dot|docx|odt|wpd)$^;
    }
    return q^(?i)\.(doc|dot)$^;
}
243
# Run the init step of both parents.  SUPER:: resolves through this
# package's @ISA (ConvertBinaryFile comes first there); the
# AutoLoadConverters init is then invoked explicitly, without arguments,
# and its result is what this method returns.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init();
}
252
# Run the begin step of both parents: AutoLoadConverters first (called
# without arguments), then the SUPER:: chain with the caller's arguments.
# The result of the SUPER:: call is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin();
    return $self->SUPER::begin(@_);
}
260
# Run the deinit step of both parents: AutoLoadConverters first (called
# without arguments), then the SUPER:: chain with the caller's arguments.
# The result of the SUPER:: call is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit();
    return $self->SUPER::deinit(@_);
}
268
# Always use the AutoLoadConverters implementation of
# tmp_area_convert_file, bypassing the normal @ISA lookup order.
# All arguments and the return value are passed straight through.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
275
276
# Legacy post-processing hook (note the "_old" suffix).
#
# Detects the language and encoding of the converted file and reads it
# into a local buffer.  The steps that used to repair invalid utf-8 and
# write the text back are commented out, so the file on disk is left
# untouched by this routine.
#
# Parameters:
#   $conv_filename - path of the converted file to read
#
# Returns whatever read_file returns.
sub convert_post_process_old
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);
}
296
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every "*.wmz" entry from the directory named by
# $self->{'files_dir'}.  Does nothing when 'files_dir' is undefined or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical directory handle is used so that no global bareword handle
    # is shared with other code.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = util::filename_cat($html_files_dir, $wmz_file);
        util::rm($full_path);
    }
    return;
}
319
320
321	1;
322

Note: See TracBrowser for help on using the repository browser.

Download in other formats: