source: gsdl/trunk/perllib/plugins/WordPlugin.pm@ 18406

Last change on this file since 18406 was 18406, checked in by ak19, 15 years ago

Modified srcreplaceable plugins (plugins which operate on docs where the source file can be replaced with their converted htmls) to set the file_rename_method to none for secondary plugins (for Text, HTML, and PagedImage plugins) so that the file is not renamed several times.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use ConvertBinaryFile;
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
# Establish the inheritance chain at compile time: WordPlugin derives
# from ConvertBinaryFile.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile);
}
34
# Plugin arguments that are available on every platform.  Each entry is a
# hash describing one argument (name, description key, type and, where
# given, default value and whether it is required).
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
];
45
# Plugin description record.  'args' points at the shared $arguments list,
# so anything appended to that list is visible here as well.
my $options = {
    'name'           => "WordPlugin",
    'desc'           => "{WordPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
52
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to ConvertBinaryFile->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the new plugin object, blessed into $class.  When the base
# constructor sets 'info_only', the object is returned immediately without
# any secondary-plugin configuration.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The Windows-only arguments are offered only when GSDLOS says we are
    # running on Windows.  The defined() guard avoids matching against undef
    # when the variable is not set at all.
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [
            { 'name' => "windows_scripting",
              'desc' => "{WordPlugin.windows_scripting}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "metadata_fields",
              'desc' => "{WordPlugin.metadata_fields}",
              'type' => "string",
              'deft' => "Title" },
            { 'name' => "level1_header",
              'desc' => "{StructuredHTMLPlugin.level1_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level2_header",
              'desc' => "{StructuredHTMLPlugin.level2_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level3_header",
              'desc' => "{StructuredHTMLPlugin.level3_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "title_header",
              'desc' => "{StructuredHTMLPlugin.title_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "delete_toc",
              'desc' => "{StructuredHTMLPlugin.delete_toc}",
              'type' => "flag",
              'reqd' => "no",
              # set the mode so this doesn't show up unless
              # all the following ones do
              'modegli' => "3" },
            { 'name' => "toc_header",
              'desc' => "{StructuredHTMLPlugin.toc_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" }
        ];

        # $arguments is a single file-level list shared by every instance
        # (and referenced from $options->{'args'}), so extend it only once;
        # otherwise each additional instance would duplicate these entries.
        my $already_added =
            grep { $_->{'name'} eq "windows_scripting" } @$arguments;
        push(@$arguments, @$ws_arg) unless $already_added;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    # this is passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # we always save as utf-8
#    if ($self->{'input_encoding'} eq "auto") {
#        $self->{'input_encoding'} = "utf8";
#    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # StructuredHTMLPlugin options are only prepared when a
    # 'windows_scripting' value exists on the object, and only if nothing
    # has configured that secondary plugin already.
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}) {
            $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

            # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            # NOTE(review): this is keyed on 'windows_scripting' rather than
            # 'description_tags' -- confirm that is intended.
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-extract_language") if $self->{'extract_language'};
            push(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push(@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push(@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push(@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push(@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push(@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            # NOTE(review): "-metadata_fields" is pushed a second time here
            # with the user-supplied value; presumably the later occurrence
            # is meant to win -- confirm against the option parser.
            push(@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: the StructuredHTMLPlugin entry is only created above
    # when 'windows_scripting' is defined.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

    # wvWare will always produce html files encoded as utf-8, so make sure
    # the secondary HTMLPlugin knows this
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    # Hand a non-empty associate_tail_re on to every secondary plugin.
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    }

    # Secondary plugins must not rename the file again.
    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Already blessed into $class above.  Returning it as-is (rather than
    # re-blessing with one-argument bless) keeps the requested class.
    return $self;
}
182
# Default regular expression (as a string) selecting the files this plugin
# processes: names ending in .doc or .dot, matched case-insensitively.
# Takes no meaningful arguments; any that are passed are ignored.
sub get_default_process_exp {
    return '(?i)\.(doc|dot)$';
}
188
# Older post-conversion hook.
#
# Parameters:
#   $self          - the plugin object
#   $conv_filename - name of the converted file
#
# Asks textcat_get_language_encoding() for the language and encoding of
# the converted file, then reads the file into a local buffer with
# read_file().  The steps that would repair and write the text back out
# are disabled, so the buffer is discarded.  The sub evaluates to whatever
# read_file() returns.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file (the buffer will be in utf8)
    my $buffer = "";

    # Disabled follow-up steps, kept for reference:
    #   turn any high bytes that aren't valid utf-8 into utf-8.
    #     unicode::ensure_utf8(\$buffer);
    #   Write it out again!
    #     $self->utf8_write_file (\$buffer, $conv_filename);

    return $self->read_file($conv_filename, $encoding, $language, \$buffer);
}
208
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML, so only the
# .wmz files found in the files directory are deleted
# (presumably these are the files VML relies on).
#
# Parameters:
#   $self - the plugin object; its 'files_dir' entry, when defined, names
#           the directory to clean.
#
# Does nothing when 'files_dir' is undefined or the directory cannot be
# opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # A lexical directory handle is used so that a package-global handle
    # of the same name held by other code is never disturbed.
    if (opendir(my $dir_handle, $html_files_dir)) {
        my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
        closedir($dir_handle);

        foreach my $f (@wmz_files) {
            my $full_f = &util::filename_cat($html_files_dir, $f);
            &util::rm($full_f);
        }
    }
    else {
        # if HTML file has no supporting images, then no _files dir made
        # => do nothing
    }

    return;
}
231
232
2331;
234
Note: See TracBrowser for help on using the repository browser.