source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22705

Last change on this file since 22705 was 22705, checked in by davidb, 14 years ago

Use of AutoloadConverterScripting expanded to encompass PowerPoint and Excel. OOConvertBinaryFile and PBConvertBinaryFile are no longer needed.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoloadConverterScripting;
34
35@WordPlugin::ISA = ('AutoloadConverterScripting');
36
37
# Base argument specifications for this plugin. The default for
# process_exp is computed once, when this statement runs at load time.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'description_tags',
        desc => '{HTMLPlugin.description_tags}',
        type => 'flag',
    },
];
48
49
# Optional argument specification for the -windows_scripting flag.
my $opt_windows_args = [
    {
        name => 'windows_scripting',
        desc => '{WordPlugin.windows_scripting}',
        type => 'flag',
        reqd => 'no',
    },
];
54
# Optional argument specifications for office-scripted conversion:
# metadata field selection plus the header / table-of-contents options
# whose descriptions come from StructuredHTMLPlugin.
my $opt_office_args = [
    {
        name => 'metadata_fields',
        desc => '{WordPlugin.metadata_fields}',
        type => 'string',
        deft => 'Title',
    },
    {
        name => 'level1_header',
        desc => '{StructuredHTMLPlugin.level1_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level2_header',
        desc => '{StructuredHTMLPlugin.level2_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'level3_header',
        desc => '{StructuredHTMLPlugin.level3_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'title_header',
        desc => '{StructuredHTMLPlugin.title_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
    {
        name => 'delete_toc',
        desc => '{StructuredHTMLPlugin.delete_toc}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'toc_header',
        desc => '{StructuredHTMLPlugin.toc_header}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
];
88
89
# Plugin option record; 'args' refers to the shared $arguments list.
my $options = {
    name           => 'WordPlugin',
    desc           => '{WordPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in Word can be replaced with GS-generated html
    args           => $arguments,
};
96
# Append to the argument list $target every spec in $extra whose 'name' is
# not already listed there, preserving the order of $extra.
sub _add_missing_args {
    my ($target, $extra) = @_;
    my %listed = map { $_->{'name'} => 1 } @$target;
    foreach my $spec (@$extra) {
        next if $listed{$spec->{'name'}};
        $listed{$spec->{'name'}} = 1;
        push(@$target, $spec);
    }
    return;
}

# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument specs are appended
#                      to its "ArgList" and its option record to "OptList"
#
# Returns the object built by AutoloadConverterScripting->new, blessed
# into $class.
#
# The optional windows/office argument specs are merged into the shared
# file-level $arguments list (which $options->{'args'} also refers to).
# They are only added when not already present, so constructing the plugin
# more than once no longer duplicates them.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # GSDLOS may be unset; treat that as "not windows".
    my $on_windows = (defined $ENV{'GSDLOS'}) && ($ENV{'GSDLOS'} =~ m/^windows$/i);
    my $office_capable = $on_windows || $AutoloadConverterScripting::openoffice_ext_working;

    _add_missing_args($arguments, $opt_windows_args) if $on_windows;
    # these office args apply to windows scripting or to openoffice scripting
    _add_missing_args($arguments, $opt_office_args) if $office_capable;

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = AutoloadConverterScripting->new("OpenOfficeConverter",$pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            # both requested: windows scripting wins
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
            $self->{'openoffice_scripting'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push(@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-delete_toc") if $self->{'delete_toc'};

        # Options forwarded with their value, but only when set to a true
        # value. Note that -metadata_fields may therefore be passed a second
        # time; presumably the later occurrence wins -- TODO confirm.
        foreach my $opt (qw(toc_header title_header
                            level1_header level2_header level3_header
                            metadata_fields metadata_field_separator)) {
            push(@$specific_options, "-$opt", $self->{$opt}) if $self->{$opt};
        }
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return bless $self, $class;
}
208
# Return the default regular expression (as a string) selecting the file
# names this plugin processes. When
# $AutoloadConverterScripting::openoffice_ext_working is true the docx and
# odt extensions are accepted in addition to doc and dot.
# Any argument passed in is ignored.
sub get_default_process_exp {
    my ($self) = @_;    # accepted but not used

    return $AutoloadConverterScripting::openoffice_ext_working
        ? q{(?i)\.(doc|dot|docx|odt)$}
        : q{(?i)\.(doc|dot)$};
}
216
# Legacy post-conversion hook. It asks textcat_get_language_encoding for
# the language and encoding of the converted file and reads the file via
# read_file into a local buffer. The steps that would fix up and rewrite
# the text are commented out, so the buffer is discarded.
#
#   $conv_filename - path of the converted file
#
# The read_file call is deliberately the final statement, so its result
# remains the sub's (implicit) return value.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);
}
236
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named
# by $self->{'files_dir'}. Does nothing when 'files_dir' is undefined or
# the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical dirhandle is used so no package-global handle is shared
    # with other code.
    opendir(my $dirhandle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dirhandle);
    # the listing is complete, so release the handle before deleting
    closedir($dirhandle);

    foreach my $f (@wmz_files) {
        my $full_f = &util::filename_cat($html_files_dir, $f);
        &util::rm($full_f);
    }
    return;
}
259
260
2611;
262
Note: See TracBrowser for help on using the repository browser.