source: gsdl/trunk/perllib/plugins/WordPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlugin;
28
29use ConvertBinaryFile;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
# Compile-time setup: declare ConvertBinaryFile as the parent class of
# WordPlugin so method lookups fall through to it.
BEGIN {
    @WordPlugin::ISA = ('ConvertBinaryFile');
}
36
# Argument description for "process_exp"; its default value is whatever
# get_default_process_exp() returns.
my $process_exp_arg = {
    'name' => "process_exp",
    'desc' => "{BasePlugin.process_exp}",
    'type' => "regexp",
    'deft' => &get_default_process_exp(),
    'reqd' => "no",
};

# Argument description for the "description_tags" flag.
my $description_tags_arg = {
    'name' => "description_tags",
    'desc' => "{HTMLPlugin.description_tags}",
    'type' => "flag",
};

# Argument list for this plugin.  new() appends this list to the caller's
# "ArgList"; $options below holds a reference to the very same array.
my $arguments = [ $process_exp_arg, $description_tags_arg ];

# Option block for this plugin; new() appends it to the caller's "OptList".
my $options = {
    'name'           => "WordPlugin",
    'desc'           => "{WordPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    'srcreplaceable' => "yes", # Source docs in Word can be replaced with GS-generated html
    'args'           => $arguments,
};
54
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed unchanged to the ConvertBinaryFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" array and its option block
#                      to its "OptList" array
#
# When $ENV{'GSDLOS'} is "windows" (case-insensitive) the Windows scripting
# arguments are added to the plugin's argument list as well.
#
# Returns the plugin object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [ { 'name' => "windows_scripting",
                         'desc' => "{WordPlugin.windows_scripting}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "metadata_fields",
                         'type' => "string",
                         'deft' => "Title" },
                       { 'name' => "level1_header",
                         'desc' => "{StructuredHTMLPlugin.level1_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level2_header",
                         'desc' => "{StructuredHTMLPlugin.level2_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level3_header",
                         'desc' => "{StructuredHTMLPlugin.level3_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "title_header",
                         'desc' => "{StructuredHTMLPlugin.title_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "delete_toc",
                         'desc' => "{StructuredHTMLPlugin.delete_toc}",
                         'type' => "flag",
                         'reqd' => "no",
                         # set the mode so this doesn't show up unless
                         # all the following ones do
                         'modegli' => "3"},
                       { 'name' => "toc_header",
                         'desc' => "{StructuredHTMLPlugin.toc_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" }
                     ];

        # $arguments is file-level state shared by every call to new() and
        # also referenced from $options->{'args'}.  Append in place (so the
        # option block still sees the extra arguments) but only the entries
        # that are not already present, otherwise each instantiation would
        # add another copy of the whole list.
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{$_->{'name'}} } @$ws_arg);
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    #this is passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # we always save as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    # NOTE(review): this tests whether windows_scripting is defined, not
    # whether it is true -- confirm that is the intended condition.
    if (defined $self->{'windows_scripting'}) {
        # Only fill in StructuredHTMLPlugin options when none exist yet.
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){
            $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

            # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-extract_language") if $self->{'extract_language'};
            push(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push(@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push(@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push(@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push(@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push(@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            push(@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: the StructuredHTMLPlugin list is only created above.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};
    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        # Skip option lists that do not exist rather than autovivifying a
        # throwaway array that no secondary plugin would ever see.
        foreach my $plugin_opts ($html_options, $text_options, $structhtml_options) {
            next unless defined $plugin_opts;
            push(@$plugin_opts, "-associate_tail_re", $associate_tail_re);
        }
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # Already blessed into $class above.  A one-argument bless here would
    # re-bless the object into the package this sub was compiled in.
    return $self;
}
179
# Returns the default "process_exp" value as a regular-expression string:
# a case-insensitive match on filenames ending in ".doc" or ".dot".
# The first argument (the invocant, when called as a method) is ignored.
sub get_default_process_exp {
    my ($invocant) = shift (@_);

    my $word_file_pattern = q^(?i)\.(doc|dot)$^;
    return $word_file_pattern;
}
185
# Legacy post-processing hook for a converted file.
#
# Asks textcat_get_language_encoding() for the language and encoding of
# $conv_filename, then reads the file through read_file().  The steps that
# would fix up and rewrite the text are commented out, so the text that was
# read is not used any further here.
#
# Returns whatever read_file() returns (it is the last statement evaluated).
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($detected_language, $detected_encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($converted_text will be in utf8)
    my $converted_text = "";
    return $self->read_file($conv_filename, $detected_encoding,
                            $detected_language, \$converted_text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$converted_text);

    # Write it out again!
    #$self->utf8_write_file (\$converted_text, $conv_filename);
}
205
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses .wmz files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}; all other entries are left in place.  Does nothing
# when 'files_dir' is not set or the directory cannot be opened.
# Callers should not rely on the return value.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical handle is used so that no package-global dirhandle is
    # opened (and closed) behind the back of other code.
    opendir(my $files_dh, $html_files_dir) or return;

    my @wmz_files = grep { /\.wmz$/ } readdir($files_dh);
    closedir($files_dh);

    foreach my $wmz_name (@wmz_files) {
        my $wmz_path = util::filename_cat($html_files_dir, $wmz_name);
        util::rm($wmz_path);
    }

    return;
}
228
229
2301;
231
Note: See TracBrowser for help on using the repository browser.