Context Navigation

source: gsdl/trunk/perllib/plugins/WordPlugin.pm@ 18406

Last change on this file since 18406 was 18406, checked in by ak19, 15 years ago
Modified srcreplaceable plugins (plugins which operate on docs where the source file can be replaced with their converted htmls) to set the file_rename_method to none for secondary plugins (for Text, HTML, and PagedImage plugins) so that the file is not renamed several times.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.4 KB

Line
1	###########################################################################
2	#
3	# WordPlugin.pm -- plugin for importing Microsoft Word documents
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package WordPlugin;
26
27	use ConvertBinaryFile;
28	use strict;
29	no strict 'refs'; # allow filehandles to be variables and viceversa
30
# Set up inheritance at compile time: WordPlugin derives from
# ConvertBinaryFile.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile);
}
34
# Arguments accepted by this plugin on every platform. new() appends
# further entries to this same list when running on Windows.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Plugin description record. Its 'args' entry is the very same array
# reference as $arguments, so anything pushed onto $arguments later is
# visible here too.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
52
# Constructor.
#
# Called as: new WordPlugin($pluginlist, $inputargs, $hashArgOptLists)
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to ConvertBinaryFile->new
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to
#                      its "ArgList" entry and its options record to its
#                      "OptList" entry
#
# Returns the object produced by ConvertBinaryFile->new, blessed into
# $class. When the object's 'info_only' field is set, it is returned
# before any secondary-plugin options are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # GSDLOS may be unset; check definedness before matching it.
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [
            { 'name' => "windows_scripting",
              'desc' => "{WordPlugin.windows_scripting}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "metadata_fields",
              'desc' => "{WordPlugin.metadata_fields}",
              'type' => "string",
              'deft' => "Title" },
            { 'name' => "level1_header",
              'desc' => "{StructuredHTMLPlugin.level1_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level2_header",
              'desc' => "{StructuredHTMLPlugin.level2_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level3_header",
              'desc' => "{StructuredHTMLPlugin.level3_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "title_header",
              'desc' => "{StructuredHTMLPlugin.title_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "delete_toc",
              'desc' => "{StructuredHTMLPlugin.delete_toc}",
              'type' => "flag",
              'reqd' => "no",
              # set the mode so this doesn't show up unless
              # all the following ones do
              'modegli' => "3" },
            { 'name' => "toc_header",
              'desc' => "{StructuredHTMLPlugin.toc_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
        ];

        # $arguments is a file-level list (also referenced from
        # $options->{'args'}), so it outlives this call. Only append the
        # Windows-specific entries that are not already present, otherwise
        # every additional construction would duplicate them.
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{ $_->{'name'} } } @$ws_arg);
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    # this is passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # we always save as utf-8
    # if ($self->{'input_encoding'} eq "auto") {
    #     $self->{'input_encoding'} = "utf8";
    # }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}) {
            $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

            # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-extract_language") if $self->{'extract_language'};
            push(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push(@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push(@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push(@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push(@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push(@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            # NOTE(review): this adds a second -metadata_fields option after
            # the fixed one above; presumably the later value wins -- confirm
            # against the secondary plugin's argument parsing.
            push(@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: the StructuredHTMLPlugin list is only created above when
    # 'windows_scripting' is defined, hence the "if defined" guards below.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

    # wvWare will always produce html files encoded as utf-8, so make sure
    # the secondary HTMLPlugin knows this
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    }

    # Secondary plugins must not rename the file again.
    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;

    # Bless before load_secondary_plugins so the method call below is
    # dispatched on $class.
    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Two-argument bless: keep the object in the class the caller asked for
    # rather than whatever package this code happens to be compiled in.
    return bless $self, $class;
}
182
# Returns the default process_exp: a case-insensitive regular expression
# (as a string) matching filenames that end in ".doc" or ".dot".
#
# The '|' must stay unescaped: inside a regex '\|' is a literal pipe
# character, which would stop the pattern matching either extension.
sub get_default_process_exp {
    return q^(?i)\.(doc|dot)$^;
}
188
# Older post-conversion step: asks textcat_get_language_encoding for the
# language and encoding of the converted file, then reads that file into a
# local buffer via read_file. Nothing is written back; the write-out steps
# are commented out below.
#
#   $conv_filename - name of the converted file to read
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my $out = $self->{'outhandle'};

    my ($lang, $enc) = $self->textcat_get_language_encoding($conv_filename);

    # read in file (contents will be in utf8)
    my $contents = "";

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$contents);

    # Write it out again!
    #$self->utf8_write_file (\$contents, $conv_filename);

    $self->read_file($conv_filename, $enc, $lang, \$contents);
}
208
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses WMZ (.wmz) files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named
# by $self->{'files_dir'}. Does nothing when 'files_dir' is not defined or
# the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical handle is used so no global bareword dirhandle is clobbered.
    opendir(my $dir_handle, $html_files_dir) or return;

    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $f (@wmz_files) {
        my $full_f = util::filename_cat($html_files_dir, $f);
        util::rm($full_f);
    }

    return;
}
231
232
233	1;
234

Note: See TracBrowser for help on using the repository browser.

Download in other formats: