source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 11851

Last change on this file since 11851 was 11851, checked in by kjdon, 18 years ago

added modegli=3 to delete_toc option so that it only shows up if the others do

  • Property svn:keywords set to Author Date Id Revision
File size: 8.8 KB
Line 
1###########################################################################
2#
3# WordPlug.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlug;
28
29use ConvertToPlug;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
# Set up inheritance at compile time: WordPlug derives from ConvertToPlug.
BEGIN {
    @WordPlug::ISA = qw(ConvertToPlug);
}
36
# Descriptions of the plugin arguments that are available on every platform.
# The constructor later appends this list to the caller's "ArgList".
my $arguments = [
    {
        name => "process_exp",
        desc => "{BasPlug.process_exp}",
        type => "regexp",
        deft => get_default_process_exp(),
        reqd => "no",
    },
    {
        name => "description_tags",
        desc => "{HTMLPlug.description_tags}",
        type => "flag",
    },
];

# Description of the plugin itself; the constructor appends it to the
# caller's "OptList". 'args' refers to the very same list as $arguments.
my $options = {
    name     => "WordPlug",
    desc     => "{WordPlug.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
53
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed unchanged to the ConvertToPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the plugin object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        # $arguments is file-level state shared by every call to new(), so
        # only add the Windows-specific arguments the first time through;
        # otherwise each construction would append another copy of them.
        my $already_added = grep { $_->{'name'} eq "windows_scripting" } @$arguments;
        if (!$already_added) {
            my $ws_arg = [ { 'name' => "windows_scripting",
                             'desc' => "{WordPlug.windows_scripting}",
                             'type' => "flag",
                             'reqd' => "no" },
                           { 'name' => "delete_toc",
                             'desc' => "{StructuredHTMLPlug.delete_toc}",
                             'type' => "flag",
                             'reqd' => "no",
                             # set the mode so this doesn't show up unless
                             # all the following ones do
                             'modegli' => "3"},
                           { 'name' => "level1_header",
                             'desc' => "{StructuredHTMLPlug.level1_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level2_header",
                             'desc' => "{StructuredHTMLPlug.level2_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level3_header",
                             'desc' => "{StructuredHTMLPlug.level3_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "title_header",
                             'desc' => "{StructuredHTMLPlug.title_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "toc_header",
                             'desc' => "{StructuredHTMLPlug.toc_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "tof_header",
                             'desc' => "{StructuredHTMLPlug.tof_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "metadata_fields",
                             'type' => "string",
                             'deft' => "Title"}];

            push(@$arguments,@$ws_arg);
        }
    }

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}},$options); }

    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist,$inputargs,$hashArgOptLists)
        : ConvertToPlug->new($pluginlist,$inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    #this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        # Only supply defaults when no StructuredHTMLPlug options were given.
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}) {
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};
            if ($self->{'input_encoding'} eq "auto") {
                $self->{'input_encoding'} = "utf8";
                $self->{'extract_language'} = 1;
                #push(@$structhtml_options,"-input_encoding", "utf8");
                push(@$structhtml_options,"-extract_language");
            }

            # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push(@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push(@$structhtml_options, "-tof_header", $self->{'tof_header'}) if $self->{'tof_header'};
            push(@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push(@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push(@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push(@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            # NOTE(review): this is a second -metadata_fields entry after the
            # fixed list above; presumably the later one wins -- confirm.
            push(@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
    # Must use the same 'TEXTPlug' key that was initialised just above;
    # reading a differently-cased key yields undef, and options pushed onto
    # it would silently go into a throwaway array.
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};
    # Stays undef unless StructuredHTMLPlug options exist (see above).
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};

    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlug knows this
    push(@$html_options,"-input_encoding", "utf8");

    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options,"-extract_language");
    }
    if ($self->{'description_tags'} == 1) {
        push(@$html_options, "-description_tags");
    }
    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Only forward to StructuredHTMLPlug when it has an option list.
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re)
            if defined $structhtml_options;
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # Already blessed into $class above. A one-argument bless here would
    # re-bless into this package and break subclasses.
    return $self;
}
186
# Default for the process_exp argument: a case-insensitive pattern that
# matches file names ending in ".doc" or ".dot".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.(doc|dot)$';
}
192
# Hook run on the converted file.
#
# Arguments:
#   $conv_filename - name of the converted file
#
# Determines the language and encoding of the file and reads it in. The
# clean-up and write-back steps are disabled (commented out below), so the
# text that is read is not used further here.
# The value of the final read_file call is what the sub evaluates to.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    $self->read_file ($conv_filename, $encoding, $language, \$text);
}
212
# Name of the file type this plugin handles.
sub get_file_type {
    my ($self) = @_;

    return "Word";
}
218
# Modified to cache HTML files for efficiency reasons rather than delete
# them all. The HTML is modified not to use IE's VML, so the .wmz files
# that VML relies on can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}. Does nothing when 'files_dir' is not set or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # If the HTML file has no supporting images, then no _files dir is made,
    # so failing to open it just means there is nothing to clean up.
    # A lexical handle is used so no global bareword handle is shared.
    opendir(my $dir_handle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $f (@wmz_files) {
        my $full_f = util::filename_cat($html_files_dir, $f);
        util::rm($full_f);
    }

    return;
}
241
# Plugin-specific processing of the document object: delegates to
# process_type with the type "doc". Of the arguments received, only the
# base directory, the file name and the document object are passed on.
sub process {
    my ($self, undef, undef, $base_dir, $file, undef, $doc_obj) = @_;

    my @type_args = ("doc", $base_dir, $file, $doc_obj);
    return $self->process_type(@type_args);
}
249
2501;
251
Note: See TracBrowser for help on using the repository browser.