source: trunk/gsdl/perllib/plugins/WordPlug.pm@ 10419

Last change on this file since 10419 was 10405, checked in by chi, 19 years ago

Adding structured HTML formatting arguments here.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# WordPlug.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# 12/05/02 Added usage datastructure - John Thompson
26
27package WordPlug;
28
29use ConvertToPlug;
30require StructuredHTMLPlug;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
# Establish the inheritance chain at compile time: WordPlug derives from
# ConvertToPlug only. (An earlier variant that also listed
# 'StructuredHTMLPlug' as a parent is intentionally not used.)
BEGIN {
    @WordPlug::ISA = ('ConvertToPlug');
}
38
# Argument descriptions declared by this plugin. The default for
# process_exp comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Description record for this plugin; its 'args' entry refers to the very
# same array as $arguments (not a copy).
my $options = {
    'name'     => "WordPlug",
    'desc'     => "{WordPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
52
# Construct a WordPlug object.
#
# Arguments (after the class name):
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - handed on unchanged to ConvertToPlug's constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#                      (created here if the caller did not supply one)
#
# Returns the new object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The extra arguments below are only offered when GSDLOS says "windows".
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        my $ws_arg = [ { 'name' => "windows_scripting",
                         'desc' => "{WordPlug.windows_scripting}",
                         'type' => "flag",
                         'reqd' => "no" },
                       { 'name' => "checkout_toc",
                         'desc' => "{WordPlug.checkout_toc}",
                         'type' => "flag",
                         'reqd' => "no"},
                       { 'name' => "level1_header",
                         'desc' => "{WordPlug.level1_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level2_header",
                         'desc' => "{WordPlug.level2_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "level3_header",
                         'desc' => "{WordPlug.level3_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "title_header",
                         'desc' => "{WordPlug.title_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "toc_header",
                         'desc' => "{WordPlug.toc_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" },
                       { 'name' => "tof_header",
                         'desc' => "{WordPlug.tof_header}",
                         'type' => "regexp",
                         'reqd' => "no",
                         'deft' => "" }];

        # $arguments is a file-level list shared by every call to new().
        # Append the Windows-only entries just once, otherwise each further
        # construction would add another duplicate set of descriptions.
        unless (grep { $_->{'name'} eq "windows_scripting" } @$arguments) {
            push(@$arguments,@$ws_arg);
        }
    }

    # Make the "no list supplied" case explicit. (Previously the push below
    # silently autovivified the hash, so the parent constructor was always
    # called with three arguments; that behaviour is kept.)
    $hashArgOptLists = {} unless defined $hashArgOptLists;

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}},$options); }

    my $self = ConvertToPlug->new($pluginlist,$inputargs,$hashArgOptLists);

    #this is passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};

    # Make sure every secondary plugin we may use has an option list.
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (defined $self->{'windows_scripting'}) {
        if (!defined $secondary_plugin_options->{'StructuredHTMLPlug'}) {
            $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
        }
    }
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};

    # wvWare will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options,"-input_encoding", "utf8");
        push(@$html_options,"-extract_language");

        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        #push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    }

    # Bless into the requested class before loading secondary plugins, and
    # return that same object. A one-argument bless here would re-bless into
    # this package and break any subclass that inherits new().
    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options);

    return $self;
}
149
# Default pattern for the filenames this plugin handles: names ending in
# ".doc" or ".dot", matched case-insensitively. The invocant, if any, is
# not used.
sub get_default_process_exp {
    my ($self) = @_;

    my $default_exp = q^(?i)\.(doc|dot)$^;
    return $default_exp;
}
155
# Post-process a converted file: detect its language and encoding, read it
# into memory, and pass the text through unicode::ensure_utf8.
#
#   $conv_filename - path of the converted file
#
# NOTE(review): the original carried a "Write it out again!" step with no
# code behind it, so the cleaned text is never saved back to
# $conv_filename. Confirm whether a write-back call belongs here.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);
}
174
# Report the file type label for this plugin: always the string "Word".
sub get_file_type {
    my ($self) = @_;
    return "Word";
}
180
# Modified to cache HTML files for efficiency reasons rather than delete
# everything. Per the original note, the HTML is modified not to use IE's
# VML, so the files VML relied on can be deleted; the code removes every
# "*.wmz" file from $self->{'files_dir'} (the original note called these
# "WML" files).
#
# Does nothing when 'files_dir' is not set or the directory cannot be
# opened. A lexical directory handle is used so that no package-global
# handle is shared with other code.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => a failed opendir just means there is nothing to clean up
    opendir(my $dirhandle, $html_files_dir) or return;
    my @wmz_files = grep { /\.wmz$/ } readdir($dirhandle);
    closedir($dirhandle);

    foreach my $f (@wmz_files) {
        my $full_f = util::filename_cat($html_files_dir, $f);
        util::rm($full_f);
    }

    return;
}
203
# Plugin-specific processing of the document object: delegates to
# process_type with the type "doc", forwarding the base directory, the
# file name and the document object. The remaining positional arguments
# (text ref, plugin info, metadata, gli) are accepted but not used here.
sub process {
    my ($self, undef, undef, $base_dir, $file, undef, $doc_obj) = @_;

    return $self->process_type("doc", $base_dir, $file, $doc_obj);
}
211
2121;
213
Note: See TracBrowser for help on using the repository browser.