source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 10429

Last change on this file since 10429 was 10428, checked in by chi, 19 years ago

Changed the way argument and option lists are passed to the secondary plugin. Also added an
option (extracted_word_metadata) to retrieve metadata from user-defined fields in the HTML document
produced by the VB scripting conversion.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# WordPlug.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlug;
28
29use ConvertToPlug;
30require StructuredHTMLPlug;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
# Establish the inheritance chain at compile time: WordPlug derives from
# ConvertToPlug. The variant that also inherits from StructuredHTMLPlug is
# kept below, commented out.
BEGIN {
    #@WordPlug::ISA = ('ConvertToPlug','StructuredHTMLPlug');
    @WordPlug::ISA = qw(ConvertToPlug);
}
38
# Argument descriptors owned by this plugin. The default for process_exp is
# taken from get_default_process_exp() when this file is loaded.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
];

# Option record describing this plugin. Its 'args' entry refers to the very
# same array as $arguments, so anything pushed onto that array later is
# visible through this record as well.
my $options = {
    'name'     => 'WordPlug',
    'desc'     => '{WordPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
52
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - array ref of plugin arguments, handed on to ConvertToPlug
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" entries collect
#                      the argument and option descriptors
#
# Returns the plugin object, blessed into $class, after its secondary
# plugins have been loaded via load_secondary_plugins().
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The Windows-scripting arguments are only offered when GSDLOS says we
    # are on Windows. $arguments is a file-level array that is also referenced
    # from $options->{'args'}, so it is extended in place -- but only once,
    # otherwise every further construction would append duplicate descriptors.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i);
    my $already_added = grep { $_->{'name'} eq "windows_scripting" } @$arguments;
    if ($on_windows && !$already_added) {
        # NOTE(review): the last descriptor has no 'desc' / 'reqd' entries,
        # unlike the others -- confirm whether that is intentional.
        my $ws_arg = [ { 'name' => "windows_scripting",
                         'desc' => "{WordPlug.windows_scripting}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "checkout_toc",
                         'desc' => "{WordPlug.checkout_toc}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "level1_header",
                         'desc' => "{WordPlug.level1_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level2_header",
                         'desc' => "{WordPlug.level2_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level3_header",
                         'desc' => "{WordPlug.level3_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "title_header",
                         'desc' => "{WordPlug.title_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "toc_header",
                         'desc' => "{WordPlug.toc_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "tof_header",
                         'desc' => "{WordPlug.tof_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "extracted_word_metadata_fields",
                         'type' => "string",
                         'deft' => "Title" } ];

        push(@$arguments, @$ws_arg);
    }

    # Register this plugin's descriptors with the lists shared along the
    # plugin chain, then let the parent class build the object.
    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    # this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # With Windows scripting, give StructuredHTMLPlug an option list unless
    # one was already supplied.
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}) {
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
            my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};
            if ($self->{'input_encoding'} eq "auto") {
                # Once input_encoding is switched to "utf8" here, the
                # equivalent "auto" test for HTMLPlug further down no longer
                # matches.
                $self->{'input_encoding'} = "utf8";
                $self->{'extract_language'} = 1;
                push(@$structhtml_options, "-input_encoding", "utf8");
                push(@$structhtml_options, "-extract_language");

                # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
                # to extract these metadata fields from the HEAD META fields
                push(@$structhtml_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
                push(@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
            }
        }
    }

    # Make sure the HTML and text secondary plugins have (possibly empty)
    # option lists.
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};

    # wvWare will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options, "-input_encoding", "utf8");
        push(@$html_options, "-extract_language");

        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        push(@$html_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    }

    # Bless into the requested class (which may be a subclass) before loading
    # the secondary plugins, and return that same object. A one-argument
    # bless here would re-bless into this package and drop the subclass.
    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
158
# Default value for the process_exp argument: a case-insensitive pattern
# matching file names that end in ".doc" or ".dot".
sub get_default_process_exp {
    my ($self) = @_;

    my $default_exp = '(?i)\.(doc|dot)$';
    return $default_exp;
}
164
# Post-process a converted file in place.
#
# Parameters:
#   $conv_filename - path of the converted file
#
# Steps: ask textcat_get_language_encoding() for the file's language and
# encoding, read the file with read_file() using those values, run the text
# through unicode::ensure_utf8(), and write it back to the same file with
# utf8_write_file().
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read_file() fills the buffer through the reference it is given
    # (per the original comment the text will be in utf8 afterwards).
    my $contents = "";
    $self->read_file($conv_filename, $encoding, $language, \$contents);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$contents);

    # Overwrite the converted file with the cleaned text.
    $self->utf8_write_file(\$contents, $conv_filename);
}
184
# Name of the file type handled by this plugin; always "Word".
sub get_file_type {
    my ($self) = @_;

    return "Word";
}
190
# Modified to cache HTML files for efficiency reasons rather
# than delete all. HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}. Does nothing when 'files_dir' is not set or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    # A lexical handle is used so no package-global dirhandle is left behind.
    opendir(my $dirhandle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dirhandle);
    closedir($dirhandle);

    foreach my $f (@wmz_files) {
        my $full_f = util::filename_cat($html_files_dir, $f);
        util::rm($full_f);
    }

    return;
}
213
# Plugin-specific processing of the document object: hands the work to
# process_type() with the type string "doc", forwarding only the base
# directory, the file name and the document object. The remaining arguments
# ($textref, $pluginfo, $metadata, $gli) are accepted but not used here.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my @type_args = ("doc", $base_dir, $file, $doc_obj);
    return $self->process_type(@type_args);
}
221
2221;
223
Note: See TracBrowser for help on using the repository browser.