Context Navigation

source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 20999

Last change on this file since 20999 was 20790, checked in by kjdon, 15 years ago
set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)
Property svn:keywords set to `Author Date Id Revision`
File size: 8.5 KB

Line
1	###########################################################################
2	#
3	# WordPlugin.pm -- plugin for importing Microsoft Word documents
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package WordPlugin;
26
27	use ConvertBinaryFile;
28	use strict;
29	no strict 'refs'; # allow filehandles to be variables and viceversa
30
# Establish inheritance at compile time: WordPlugin derives from
# ConvertBinaryFile.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile);
}
34
# Argument descriptions that this plugin contributes on every platform.
# The 'desc' values are brace-wrapped keys; presumably they are looked up
# in a string resource elsewhere -- confirm against the argument parser.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Plugin-level option record; 'args' points at the argument list above,
# so later additions to that list are visible through this record too.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
52
# Constructor.
#
# Registers this plugin's argument and option descriptions in
# $hashArgOptLists, builds the object through ConvertBinaryFile, prepares
# the option lists handed to the secondary plugins (HTMLPlugin, TextPlugin
# and, when 'windows_scripting' is defined, StructuredHTMLPlugin) and then
# calls load_secondary_plugins.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to ConvertBinaryFile->new
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's arguments and options
#
# Returns the new object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # GSDLOS may be unset; treat that the same as "not windows".
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    if ($gsdlos =~ m/^windows$/i) {
        # $arguments is shared by every instance created from this file, so
        # only append the Windows-specific arguments the first time through;
        # otherwise each construction would add another copy of them.
        my $ws_registered = grep { $_->{'name'} eq "windows_scripting" } @$arguments;
        if (!$ws_registered) {
            my $ws_arg = [ { 'name' => "windows_scripting",
                             'desc' => "{WordPlugin.windows_scripting}",
                             'type' => "flag",
                             'reqd' => "no" },
                           { 'name' => "metadata_fields",
                             'desc' => "{WordPlugin.metadata_fields}",
                             'type' => "string",
                             'deft' => "Title" },
                           { 'name' => "level1_header",
                             'desc' => "{StructuredHTMLPlugin.level1_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level2_header",
                             'desc' => "{StructuredHTMLPlugin.level2_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level3_header",
                             'desc' => "{StructuredHTMLPlugin.level3_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "title_header",
                             'desc' => "{StructuredHTMLPlugin.title_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "delete_toc",
                             'desc' => "{StructuredHTMLPlugin.delete_toc}",
                             'type' => "flag",
                             'reqd' => "no" },
                           { 'name' => "toc_header",
                             'desc' => "{StructuredHTMLPlugin.toc_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" }
                         ];

            push(@$arguments,@$ws_arg);
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    #this is passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # we always save as utf-8
    # if ($self->{'input_encoding'} eq "auto") {
    #     $self->{'input_encoding'} = "utf8";
    # }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        # Only fill in StructuredHTMLPlugin options when nothing has
        # supplied a list for it already.
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){
            $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

            # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push (@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
            push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push (@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push (@$structhtml_options, "-extract_language") if $self->{'extract_language'};
            push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push (@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: the StructuredHTMLPlugin list only exists when it was
    # created above or was already present in secondary_plugin_options.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");
    # Guarded like the pushes below: an unguarded push would autovivify a
    # throwaway array in $structhtml_options and make the later
    # "if defined" checks always succeed.
    push(@$structhtml_options, "-processing_tmp_files") if defined $structhtml_options;

    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this
    push(@$html_options,"-input_encoding", "utf8");
    push(@$html_options,"-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    }

    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # $self is already blessed into $class. A one-argument bless here would
    # re-bless it into WordPlugin and break classes that inherit new().
    return $self;
}
183
# Returns the default -process_exp pattern (as a string): a
# case-insensitive match on file names ending in ".doc" or ".dot".
#
# Takes no meaningful arguments; it is called as a plain function when the
# argument table is built, and any arguments passed are ignored.
sub get_default_process_exp {
    # The alternation bar must be unescaped: "\|" would be a literal pipe
    # and the pattern would then only match names ending in ".doc|dot".
    return q{(?i)\.(doc|dot)$};
}
189
# Legacy post-conversion hook. Detects the language and encoding of the
# converted file and reads it into a local buffer. The steps that used to
# repair and rewrite the file are disabled, so the text read here is not
# written back anywhere.
#
# Parameters:
#   $conv_filename - path of the converted file
#
# Returns whatever $self->read_file returns.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($detected_language, $detected_encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # Disabled follow-up steps, kept for reference:
    #   unicode::ensure_utf8(\$text);                     # fix invalid high bytes
    #   $self->utf8_write_file(\$text, $conv_filename);   # write it out again

    # read in file (the buffer will be in utf8)
    my $utf8_text = "";
    return $self->read_file($conv_filename, $detected_encoding,
                            $detected_language, \$utf8_text);
}
209
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named
# by $self->{'files_dir'}; all other entries are left in place. Does
# nothing when 'files_dir' is undefined or the directory cannot be opened.
# No meaningful return value.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical handle is used so that no package-global dirhandle is
    # clobbered or left open.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $f (@wmz_files) {
        my $full_f = util::filename_cat($html_files_dir, $f);
        util::rm($full_f);
    }

    return;
}
232
233
234	1;
235

Note: See TracBrowser for help on using the repository browser.

Download in other formats: