source: gsdl/trunk/perllib/plugins/WordPlugin.pm@ 20790

Last change on this file since 20790 was 20790, checked in by kjdon, 12 years ago

set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)

  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use ConvertBinaryFile;
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
# Establish the inheritance chain at compile time: WordPlugin is a
# ConvertBinaryFile.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile);
}
34
# Option descriptors offered by this plugin on every platform.  new() appends
# further Windows-only descriptors to this same list, and $options below
# refers to it through its 'args' entry.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => &get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Plugin-level description record handed to the option-parsing machinery
# via $hashArgOptLists->{"OptList"} in new().
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
52
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to ConvertBinaryFile->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the plugin object blessed into $class.  When the parent constructor
# reports 'info_only', the object is returned immediately without any of the
# secondary-plugin configuration below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The Windows-scripting options are only offered when GSDLOS says we are
    # on Windows.  Guard against GSDLOS being unset.
    my $gsdl_os = $ENV{'GSDLOS'};
    if (defined $gsdl_os && $gsdl_os =~ m/^windows$/i) {
        my $ws_arg = [ { 'name' => "windows_scripting",
                         'desc' => "{WordPlugin.windows_scripting}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "metadata_fields",
                         'desc' => "{WordPlugin.metadata_fields}",
                         'type' => "string",
                         'deft' => "Title" },
                       { 'name' => "level1_header",
                         'desc' => "{StructuredHTMLPlugin.level1_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level2_header",
                         'desc' => "{StructuredHTMLPlugin.level2_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level3_header",
                         'desc' => "{StructuredHTMLPlugin.level3_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "title_header",
                         'desc' => "{StructuredHTMLPlugin.title_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "delete_toc",
                         'desc' => "{StructuredHTMLPlugin.delete_toc}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "toc_header",
                         'desc' => "{StructuredHTMLPlugin.toc_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" }
                     ];

        # $arguments is file-level state shared by every call (and referenced
        # from $options->{'args'}), so append the Windows descriptors only
        # once; otherwise each additional instantiation would duplicate them.
        my $already_added = grep { $_->{'name'} eq "windows_scripting" } @$arguments;
        push(@$arguments, @$ws_arg) unless $already_added;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-form method call: unambiguous even though this package defines
    # its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    # this is passed through to gsConvert.pl by ConvertBinaryFile.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # we always save as utf-8
#    if ($self->{'input_encoding'} eq "auto") {
#        $self->{'input_encoding'} = "utf8";
#    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        # Only build a StructuredHTMLPlugin option list when one has not
        # already been supplied.
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}) {
            $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

            # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            # NOTE(review): -description_tags is keyed off 'windows_scripting'
            # here, not 'description_tags' -- confirm this is intentional.
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-extract_language") if $self->{'extract_language'};
            push(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push(@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push(@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push(@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push(@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push(@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            push(@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef: a StructuredHTMLPlugin list only exists if it was supplied
    # or created in the windows_scripting branch above.
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};

    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");
    # Guarded so that an undef list is not autovivified into a throwaway
    # array, which would defeat the 'defined' checks further down.
    push(@$structhtml_options, "-processing_tmp_files") if defined $structhtml_options;

    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};
    push(@$html_options, "-description_tags") if $self->{'description_tags'};

    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    }

    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;

    bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Already blessed into $class above; a one-argument bless here would
    # re-bless into WordPlugin and discard the requested class.
    return $self;
}
183
# Return the default process_exp for this plugin: a case-insensitive
# pattern (as a string) matching filenames that end in .doc or .dot.
sub get_default_process_exp {
    my ($self) = @_;    # accepted for method-style calls; not used

    my $word_extension_pattern = '(?i)\.(doc|dot)$';
    return $word_extension_pattern;
}
189
# Legacy post-conversion hook.
#
# Asks textcat_get_language_encoding for the language and encoding of the
# converted file, then reads the file into a local buffer via read_file.
# The follow-up steps that once repaired invalid utf-8 and wrote the text
# back out are disabled, so the buffer is discarded when this sub returns.
#
#   $conv_filename - path of the converted file to read
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    # Fetched as in the original implementation; not used below.
    my $out = $self->{'outhandle'};

    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # read in file (the buffer will be in utf8)
    my $buffer = "";

    # Disabled follow-up steps, kept for reference:
    #   unicode::ensure_utf8(\$buffer);
    #   $self->utf8_write_file(\$buffer, $conv_filename);
    return $self->read_file($conv_filename, $encoding, $language, \$buffer);
}
209
# Tidy the temporary conversion area for this document.
#
# Modified to cache HTML files for efficiency reasons rather than delete
# everything: only the *.wmz files found directly inside
# $self->{'files_dir'} are removed.  (Per the original note, the HTML is
# modified not to use IE's VML, so the .wmz files it relied on can go.)
#
# Does nothing when 'files_dir' is not set or the directory cannot be
# opened -- if the HTML file has no supporting images, no _files directory
# is made.  Returns nothing.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # Lexical directory handle rather than a package-global bareword, so the
    # handle cannot clash with another DIN elsewhere.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    # The listing is fully read, so close the handle before deleting.
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = &util::filename_cat($html_files_dir, $wmz_file);
        &util::rm($full_path);
    }

    return;
}
232
233
2341;
235
Note: See TracBrowser for help on using the repository browser.