source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 11849

Last change on this file since 11849 was 11849, checked in by kjdon, 18 years ago

Added the header etc. args to the StructuredHTMLPlug args; otherwise it barfs when WordPlug tries to pass them to it. The WordPlug arg descriptions have now been changed to the StructuredHTMLPlug strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1###########################################################################
2#
3# WordPlug.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlug;
28
29use ConvertToPlug;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
# Set up inheritance at compile time: WordPlug is a ConvertToPlug.
BEGIN {
    @WordPlug::ISA = qw(ConvertToPlug);
}
36
# Description of the "process_exp" argument; its default comes from
# get_default_process_exp().
my $process_exp_arg = {
    'name' => "process_exp",
    'desc' => "{BasPlug.process_exp}",
    'type' => "regexp",
    'deft' => &get_default_process_exp(),
    'reqd' => "no",
};

# Description of the "description_tags" flag.
my $description_tags_arg = {
    'name' => "description_tags",
    'desc' => "{HTMLPlug.description_tags}",
    'type' => "flag",
};

# File-level list of argument descriptions; new() pushes it onto the
# "ArgList" entry of the hash it is given.
my $arguments = [ $process_exp_arg, $description_tags_arg ];

# File-level option record for this plugin; new() pushes it onto the
# "OptList" entry of the hash it is given.
my $options = {
    'name'     => "WordPlug",
    'desc'     => "{WordPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
53
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed unchanged to ConvertToPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the plugin object blessed into $class.  When the object built by
# ConvertToPlug has 'info_only' set, it is returned straight away without
# configuring any secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        # $arguments is file-level state shared by every call to new(), so
        # the Windows-only descriptions must be appended only once;
        # otherwise each construction would add another copy of them.
        my $already_added =
            grep { $_->{'name'} eq "windows_scripting" } @$arguments;

        if (!$already_added) {
            my $ws_arg = [ { 'name' => "windows_scripting",
                             'desc' => "{WordPlug.windows_scripting}",
                             'type' => "flag",
                             'reqd' => "no" },
                           { 'name' => "delete_toc",
                             'desc' => "{StructuredHTMLPlug.delete_toc}",
                             'type' => "flag",
                             'reqd' => "no"},
                           { 'name' => "level1_header",
                             'desc' => "{StructuredHTMLPlug.level1_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level2_header",
                             'desc' => "{StructuredHTMLPlug.level2_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "level3_header",
                             'desc' => "{StructuredHTMLPlug.level3_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "title_header",
                             'desc' => "{StructuredHTMLPlug.title_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "toc_header",
                             'desc' => "{StructuredHTMLPlug.toc_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "tof_header",
                             'desc' => "{StructuredHTMLPlug.tof_header}",
                             'type' => "regexp",
                             'reqd' => "no",
                             'deft' => "" },
                           { 'name' => "metadata_fields",
                             'type' => "string",
                             'deft' => "Title"}];

            push(@$arguments,@$ws_arg);
        }
    }

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist,$inputargs,$hashArgOptLists)
        : ConvertToPlug->new($pluginlist,$inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    #this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        # Only build the StructuredHTMLPlug option list when none exists yet.
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}){
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};
            if ($self->{'input_encoding'} eq "auto") {
                $self->{'input_encoding'} = "utf8";
                $self->{'extract_language'} = 1;
                #push(@$structhtml_options,"-input_encoding", "utf8");
                push(@$structhtml_options,"-extract_language");
            }

            # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push (@$structhtml_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
            push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push (@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
            push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
            push (@$structhtml_options, "-tof_header", $self->{'tof_header'}) if $self->{'tof_header'};
            push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
            push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
            push (@$structhtml_options, "-level2_header", $self->{'level2_header'}) if $self->{'level2_header'};
            push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
            push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
    # Must use the same 'TEXTPlug' key that was initialised just above; a
    # differently-cased key yields undef and any options pushed through it
    # would never reach the secondary plugin option table.
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};
    # May be undef: the StructuredHTMLPlug list is only created in the
    # windows_scripting branch above (or supplied already populated).
    my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};

    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlug knows this
    push(@$html_options,"-input_encoding", "utf8");

    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options,"-extract_language");
    }
    if ($self->{'description_tags'} == 1) {
        push(@$html_options, "-description_tags");
    }
    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Nothing to configure when no StructuredHTMLPlug list exists.
        push(@$structhtml_options, "-associate_tail_re", $associate_tail_re)
            if defined $structhtml_options;
    }

    # Bless before loading the secondary plugins so the method call below
    # is dispatched through $class.
    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    # Already blessed into $class above; a one-argument bless here would
    # re-bless into the current package instead of the requested class.
    return $self;
}
183
# Return the default filename pattern for this plugin: a case-insensitive
# match on names ending in ".doc" or ".dot".
sub get_default_process_exp {
    my ($self) = @_;

    my $word_file_re = '(?i)\.(doc|dot)$';
    return $word_file_re;
}
189
# Post-conversion hook for the converted file named by $conv_filename.
# Asks textcat_get_language_encoding() for the file's language and
# encoding, then reads the file into a local buffer via read_file().
# The buffer is not written back: the re-encoding and rewrite steps
# below remain disabled.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my @detected = $self->textcat_get_language_encoding($conv_filename);
    my ($language, $encoding) = @detected;

    # read in file (the buffer will be in utf8)
    my $buffer = "";
    return $self->read_file($conv_filename, $encoding, $language, \$buffer);

    # Disabled: turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Disabled: write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);
}
209
# Return the file-type label for this plugin, the string "Word".
sub get_file_type {
    my ($self) = @_;
    return "Word";
}
215
# Modified to cache HTML files for efficiency reasons rather
# than delete them all. HTML is modified not to use IE's VML.
# Only the .wmz files in the directory are deleted here.
#
# Reads $self->{'files_dir'}; when it is defined and can be opened as a
# directory, every entry whose name ends in ".wmz" is removed with
# util::rm.  Does nothing when 'files_dir' is undefined or cannot be
# opened.
sub cleanup_tmp_area {
    my ($self) = @_;
    if (defined $self->{'files_dir'}) {
        my $html_files_dir = $self->{'files_dir'};

        # Lexical directory handle: a bareword handle is a package global
        # that any other code using the same name could clobber.
        if (opendir(my $dir_handle, $html_files_dir)) {
            my @wmz_files = grep( /\.wmz$/, readdir($dir_handle));
            foreach my $f (@wmz_files) {
                my $full_f = &util::filename_cat($html_files_dir,$f);
                &util::rm($full_f);
            }
            closedir($dir_handle);
        }
        else {
            # if HTML file has no supporting images, then no _files dir made
            # => do nothing
        }
    }
}
238
# Plugin-specific processing of doc_obj: hands off to process_type() with
# the type string "doc", forwarding only the base directory, the file name
# and the document object.  The remaining arguments are accepted but not
# used here.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $doc_type = "doc";
    return $self->process_type($doc_type, $base_dir, $file, $doc_obj);
}
246
2471;
248
Note: See TracBrowser for help on using the repository browser.