source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 10478

Last change on this file since 10478 was 10441, checked in by chi, 19 years ago

Modifications for pushing the required option and argument lists to StructuredHTMLPlug; also comments out
ensure_utf8(), as it seems to be messing up document text that is UTF-8 encoded.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# WordPlug.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlug;
28
29use ConvertToPlug;
30require StructuredHTMLPlug;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
# Set up inheritance at compile time: WordPlug is a ConvertToPlug.
BEGIN {
    #@WordPlug::ISA = ('ConvertToPlug','StructuredHTMLPlug');
    @WordPlug::ISA = qw(ConvertToPlug);
}
38
# Argument descriptions advertised by this plugin.  new() appends further,
# Windows-only entries to this list when GSDLOS is "windows".
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
];

# Option block describing the plugin itself; 'args' points at the list above.
my $options = {
    'name'     => 'WordPlug',
    'desc'     => '{WordPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
52
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed on unchanged to the ConvertToPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" array and its option block to "OptList"
#
# Builds the option lists for the secondary plugins (StructuredHTMLPlug,
# HTMLPlug, TEXTPlug), loads them via load_secondary_plugins() and returns
# the object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The Windows-only arguments are only advertised when running on Windows.
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [
            { 'name' => "windows_scripting",
              'desc' => "{WordPlug.windows_scripting}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "checkout_toc",
              'desc' => "{WordPlug.checkout_toc}",
              'type' => "flag",
              'reqd' => "no" },
            { 'name' => "level1_header",
              'desc' => "{WordPlug.level1_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level2_header",
              'desc' => "{WordPlug.level2_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "level3_header",
              'desc' => "{WordPlug.level3_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "title_header",
              'desc' => "{WordPlug.title_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "toc_header",
              'desc' => "{WordPlug.toc_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "tof_header",
              'desc' => "{WordPlug.tof_header}",
              'type' => "regexp",
              'reqd' => "no",
              'deft' => "" },
            { 'name' => "extracted_word_metadata_fields",
              'type' => "string",
              'deft' => "Title" },
        ];

        # $arguments is a file-level list shared by every call to new();
        # append the Windows entries only once so that constructing several
        # WordPlug objects does not accumulate duplicate argument entries.
        push(@$arguments, @$ws_arg)
            unless grep { $_->{'name'} eq "windows_scripting" } @$arguments;
    }

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    # this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Only fill in StructuredHTMLPlug options when the windows_scripting
    # argument exists and no options were already supplied for that plugin.
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}) {
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};

            if ($self->{'input_encoding'} eq "auto") {
                $self->{'input_encoding'} = "utf8";
                $self->{'extract_language'} = 1;
                push(@$structhtml_options, "-input_encoding", "utf8");
                push(@$structhtml_options, "-extract_language");
            }

            # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
            # to extract these metadata fields from the HEAD META fields
            push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
            push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            push(@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
            push(@$structhtml_options, "-checkout_toc") if $self->{'checkout_toc'};

            # Forward each header regexp only when one was actually given.
            foreach my $header_opt (qw(toc_header tof_header title_header
                                       level1_header level2_header level3_header)) {
                push(@$structhtml_options, "-$header_opt", $self->{$header_opt})
                    if $self->{$header_opt};
            }

            push(@$structhtml_options, "-extracted_word_metadata_fields",
                 $self->{'extracted_word_metadata_fields'})
                if $self->{'extracted_word_metadata_fields'};
        }
    }

    # Make sure option lists exist for the other secondary plugins.  No
    # options are added to the TEXTPlug list here.
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};

    # wvWare will always produce html files encoded as utf-8
    # NOTE: if the StructuredHTMLPlug branch above already switched
    # input_encoding from "auto" to "utf8", this test is false and the
    # encoding options are not added to the HTMLPlug list.
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options, "-input_encoding", "utf8");
        push(@$html_options, "-extract_language");
    }

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    # Already blessed into $class above.  A one-argument bless here would
    # re-bless into this package and break subclasses.
    return $self;
}
167
# Default regular expression (as a string) for the filenames this plugin
# handles: names ending in .doc or .dot, case-insensitively.
# Takes no meaningful arguments; works as a plain function or as a method.
sub get_default_process_exp {
    return '(?i)\.(doc|dot)$';
}
173
# Post-processing hook for a converted file.
#
# Asks textcat_get_language_encoding() for the language and encoding of
# $conv_filename and reads the file through read_file().  The steps that
# used to repair and rewrite the text are disabled (kept below as comments),
# so the file on disk is not modified here.
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($detected_language, $detected_encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    # read in file (the buffer will be in utf8); this stays the final
    # statement so the sub's implicit return value is unchanged
    my $contents = "";
    $self->read_file($conv_filename, $detected_encoding, $detected_language, \$contents);
}
193
# Returns the file-type label for this plugin: the string "Word".
sub get_file_type {
    return "Word";
}
199
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses .wmz files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}.  Does nothing when 'files_dir' is not set or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    #
    # A lexical directory handle is used instead of a package-global
    # bareword, so it cannot collide with another handle of the same name.
    opendir(my $dir_handle, $html_files_dir) or return;

    foreach my $wmz_file (grep { /\.wmz$/ } readdir($dir_handle)) {
        my $full_path = util::filename_cat($html_files_dir, $wmz_file);
        util::rm($full_path);
    }

    closedir($dir_handle);
}
222
# Plugin-specific processing of the doc_obj: hands off to process_type()
# with the type string "doc".  Of the positional arguments only $base_dir
# (3rd), $file (4th) and $doc_obj (6th) are used; the text ref, plugin info,
# metadata and gli arguments are ignored here.
sub process {
    my ($self, undef, undef, $base_dir, $file, undef, $doc_obj) = @_;

    return $self->process_type("doc", $base_dir, $file, $doc_obj);
}
230
2311;
232
Note: See TracBrowser for help on using the repository browser.