Context Navigation

source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 10429

Last change on this file since 10429 was 10428, checked in by chi, 19 years ago
Modification of the way passing argument and option lists for the secondary plugin. Also, add an option (extracted_word_metadata) to retrieve metadata based on user-defined fields from the html document converted by VB scripting.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Line
1	###########################################################################
2	#
3	# WordPlug.pm -- plugin for importing Microsoft Word documents
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	# 12/05/02 Added usage datastructure - John Thompson
26
27	package WordPlug;
28
29	use ConvertToPlug;
30	require StructuredHTMLPlug;
31	use strict;
32	no strict 'refs'; # allow filehandles to be variables and viceversa
33
# Set up inheritance at compile time: WordPlug derives from ConvertToPlug only.
# (An earlier variant that also inherited from StructuredHTMLPlug is disabled:
#  @WordPlug::ISA = ('ConvertToPlug','StructuredHTMLPlug');)
BEGIN {
    @WordPlug::ISA = qw(ConvertToPlug);
}
38
# Argument descriptions contributed by this plugin. The list is file-level
# state: new() appends to it and hands it on to the parent constructor.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Plugin description record; 'args' refers to the very same array as
# $arguments, so later additions to that list are visible here as well.
my $options = {
    'name'     => "WordPlug",
    'desc'     => "{WordPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
52
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of plugin arguments, passed to ConvertToPlug
#   $hashArgOptLists - hash ref with "ArgList" / "OptList" arrays that this
#                      plugin's argument and option descriptions are added to
#                      (created here when the caller passes undef)
#
# Returns the new object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The Windows-only arguments are offered only when GSDLOS says "windows".
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [
            { 'name' => "windows_scripting",
              'desc' => "{WordPlug.windows_scripting}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "checkout_toc",
              'desc' => "{WordPlug.checkout_toc}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "level1_header",
              'desc' => "{WordPlug.level1_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level2_header",
              'desc' => "{WordPlug.level2_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level3_header",
              'desc' => "{WordPlug.level3_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "title_header",
              'desc' => "{WordPlug.title_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "toc_header",
              'desc' => "{WordPlug.toc_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "tof_header",
              'desc' => "{WordPlug.tof_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "extracted_word_metadata_fields",
              'type' => "string",
              'reqd' => "no",
              'deft' => "Title" },
        ];

        # $arguments is shared by every instance (and by $options->{'args'}),
        # so append each Windows argument only once; otherwise every call to
        # new() would add another copy of the whole set.
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{ $_->{'name'} } } @$ws_arg);
    }

    # Register our argument/option descriptions, creating the container when
    # the caller did not supply one.
    $hashArgOptLists = {} unless defined $hashArgOptLists;
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    #this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}) {
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};
            if ($self->{'input_encoding'} eq "auto") {
                $self->{'input_encoding'} = "utf8";
                $self->{'extract_language'} = 1;
                _push_utf8_html_options($structhtml_options);
            }
        }
    }

    # Make sure both secondary plugins have an option list. TEXTPlug gets no
    # extra options from this plugin.
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};

    # wvWare will always produce html files encoded as utf-8
    # NOTE(review): when the StructuredHTMLPlug branch above has already
    # switched input_encoding from "auto" to "utf8", this branch is skipped
    # and HTMLPlug receives no options -- confirm that this is intended.
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        _push_utf8_html_options($html_options);
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Already blessed into $class above; a one-argument bless here would
    # re-bless into WordPlug and break subclasses.
    return $self;
}

# Append the options used for utf-8 HTML output to the given secondary
# plugin option list (array ref).
sub _push_utf8_html_options {
    my ($plugin_options) = @_;

    push(@$plugin_options, "-input_encoding", "utf8");
    push(@$plugin_options, "-extract_language");

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$plugin_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$plugin_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
}
158
# Returns the default process_exp for this plugin as a regexp string:
# a case-insensitive match on file names ending in ".doc" or ".dot".
# May be called as a method or as a plain function; the invocant is unused.
sub get_default_process_exp {
    my $self = shift (@_);

    # The alternation bar must be unescaped: "\|" would match a literal
    # '|' character and so would never accept a real .doc/.dot file name.
    return q^(?i)\.(doc|dot)$^;
}
164
# Post-process a converted file in place: detect its language and encoding,
# read it, make sure the text is valid utf-8, and write it back to the same
# file name.
#
# Parameters:
#   $conv_filename - name of the converted file to rewrite
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($content will be in utf8)
    my $content = "";
    $self->read_file($conv_filename, $encoding, $language, \$content);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$content);

    # Write it out again!
    $self->utf8_write_file(\$content, $conv_filename);
}
174	# read in file ($text will be in utf8)
175	my $text = "";
176	$self->read_file ($conv_filename, $encoding, $language, \$text);
177
178	# turn any high bytes that aren't valid utf-8 into utf-8.
179	unicode::ensure_utf8(\$text);
180
181	# Write it out again!
182	$self->utf8_write_file (\$text, $conv_filename);
183	}
184
# Returns the name of the file type this plugin handles: "Word".
sub get_file_type {
    my ($self) = @_;

    return "Word";
}
190
# Modified to cache HTML files for efficiency reasons rather than delete
# them all. The HTML is modified not to use IE's VML, so the .wmz files
# that accompany it (called "WML" files in the original note) can be
# deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}. Does nothing when 'files_dir' is not set or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    if (defined $self->{'files_dir'}) {
        my $html_files_dir = $self->{'files_dir'};

        # A lexical directory handle avoids sharing a package-global
        # bareword handle with any other code that is reading a directory.
        if (opendir(my $dir_handle, $html_files_dir)) {
            my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
            foreach my $f (@wmz_files) {
                my $full_f = &util::filename_cat($html_files_dir, $f);
                &util::rm($full_f);
            }
            closedir($dir_handle);
        }
        else {
            # if HTML file has no supporting images, then no _files dir made
            # => do nothing
        }
    }
}
213
# Plugin-specific processing of a document object: hands the work to
# process_type() with the type string "doc".
#
# Parameters (after the invocant):
#   $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# Only $base_dir, $file and $doc_obj are used here.
#
# Returns whatever process_type() returns.
sub process {
    my ($self, undef, undef, $base_dir, $file, undef, $doc_obj) = @_;

    return $self->process_type("doc", $base_dir, $file, $doc_obj);
}
221
222	1;
223

Note: See TracBrowser for help on using the repository browser.

Download in other formats: