source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm@ 22658

Last change on this file since 22658 was 22641, checked in by kjdon, 14 years ago

now inherits from OOConvertBinaryFile. still a couple of things to iron out

  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use OOConvertBinaryFile;
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32no strict 'subs';
33use gsprintf 'gsprintf';
34
# Set up inheritance at compile time: WordPlugin is a subclass of
# OOConvertBinaryFile.
BEGIN {
    @WordPlugin::ISA = qw(OOConvertBinaryFile);
}
38
# Arguments that WordPlugin always accepts.  The default for process_exp is
# computed once, when this file is loaded.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
];

# Extra argument that the constructor adds to $arguments when GSDLOS
# says we are running on Windows.
my $opt_windows_args = [
    {
        'name' => "windows_scripting",
        'desc' => "{WordPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Extra arguments that the constructor adds to $arguments when some form of
# office scripting (Windows scripting or OpenOffice) can be used.
my $opt_office_args = [
    {
        'name' => "metadata_fields",
        'desc' => "{WordPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title",
    },
    {
        'name' => "level1_header",
        'desc' => "{StructuredHTMLPlugin.level1_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level2_header",
        'desc' => "{StructuredHTMLPlugin.level2_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level3_header",
        'desc' => "{StructuredHTMLPlugin.level3_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "title_header",
        'desc' => "{StructuredHTMLPlugin.title_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "delete_toc",
        'desc' => "{StructuredHTMLPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "toc_header",
        'desc' => "{StructuredHTMLPlugin.toc_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
];

# Plugin description record.  'args' refers to the very same array as
# $arguments, so anything later pushed onto $arguments shows up here too.
my $options = {
    'name'           => "WordPlugin",
    'desc'           => "{WordPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
97
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to OOConvertBinaryFile->new
#   $hashArgOptLists - hash ref; this plugin's argument specs are appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object created by OOConvertBinaryFile->new, re-blessed into
# $class.  When the object's 'info_only' flag is set, it is returned before
# any conversion settings or secondary plugins are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # $arguments is a single file-level list shared by every instance (and
    # referenced from $options->{'args'}).  Only append optional argument
    # specs that are not already in it, so that constructing the plugin more
    # than once in the same process does not pile up duplicate entries.
    my $add_args_once = sub {
        my ($extra_args) = @_;
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{$_->{'name'}} } @$extra_args);
    };

    my $office_capable = 0;
    # GSDLOS may be unset; treat that the same as "not windows".
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $add_args_once->($opt_windows_args);
        $office_capable = 1;
    }
    if ($OOConvertBinaryFile::openoffice_ext_working) {
        $office_capable = 1;
    }
    # these office args apply to windows scripting or to openoffice scripting
    if ($office_capable) {
        $add_args_once->($opt_office_args);
    }

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than indirect object syntax
    # ("new OOConvertBinaryFile(...)"), which is ambiguous to the parser in a
    # package that defines its own new().
    my $self = OOConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            # Both requested: windows scripting wins ('office_scripting' was
            # already set above).
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
            $self->{'openoffice_scripting'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    # ('office_scripting' is only ever assigned 1 above, so "defined" here
    # means "some form of office scripting was selected")
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # NOTE(review): when 'metadata_fields' is set this passes a second
        # -metadata_fields option after the fixed list above; how the
        # secondary plugin resolves the repeat is not visible here.
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return bless $self, $class;
}
209
# Return the default process_exp: a case-insensitive regular expression
# (as a string) matching the file extensions this plugin handles.  The
# docx and odt extensions are only included when
# $OOConvertBinaryFile::openoffice_ext_working is true.
sub get_default_process_exp {
    # Not used; the sub is also called as a plain function with no
    # arguments when the argument table is built.
    my $invocant = shift (@_);

    return $OOConvertBinaryFile::openoffice_ext_working
        ? q^(?i)\.(doc|dot|docx|odt)$^
        : q^(?i)\.(doc|dot)$^;
}
217
# Old post-processing step for a converted file.
#
# Asks textcat_get_language_encoding for the language and encoding of
# $conv_filename, then reads the file into a local buffer via read_file.
# Nothing is written back: the follow-up steps (unicode::ensure_utf8 on the
# text and utf8_write_file back to $conv_filename) are disabled.
# The value returned is whatever read_file returns.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    return $self->read_file($conv_filename, $encoding, $language, \$text);

    # Disabled:
    #   turn any high bytes that aren't valid utf-8 into utf-8.
    #     unicode::ensure_utf8(\$text);
    #   Write it out again!
    #     $self->utf8_write_file (\$text, $conv_filename);
}
237
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}; all other entries are left in place.  Does nothing
# when 'files_dir' is not defined or the directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical directory handle is used so that no global bareword handle
    # shared with other code is reopened or closed here.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $f (@wmz_files) {
        my $full_f = &util::filename_cat($html_files_dir, $f);
        &util::rm($full_f);
    }

    return;
}
260
261
2621;
263
Note: See TracBrowser for help on using the repository browser.