root/main/trunk/greenstone2/perllib/plugins/WordPlugin.pm @ 22705

Revision 22705, 8.9 KB (checked in by davidb, 9 years ago)

Use of AutoloadConverterScripting expanded to encompass PowerPoint and Excel. OOConvertBinaryFile and PBConvertBinaryFile are no longer needed.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoloadConverterScripting;
34
35@WordPlugin::ISA = ('AutoloadConverterScripting');
36
37
# Argument specifications that WordPlugin always offers.  new() may append
# further specs to this same array, and $options->{'args'} refers to it.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => &get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
48
49
# Extra argument spec that new() adds when $ENV{'GSDLOS'} matches "windows".
my $opt_windows_args = [
    {
        'name' => 'windows_scripting',
        'desc' => '{WordPlugin.windows_scripting}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];
54
# Extra argument specs that new() adds when Windows scripting or OpenOffice
# scripting is usable.
my $opt_office_args = [
    {
        'name' => 'metadata_fields',
        'desc' => '{WordPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {
        'name' => 'level1_header',
        'desc' => '{StructuredHTMLPlugin.level1_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level2_header',
        'desc' => '{StructuredHTMLPlugin.level2_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'level3_header',
        'desc' => '{StructuredHTMLPlugin.level3_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'title_header',
        'desc' => '{StructuredHTMLPlugin.title_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
    {
        'name' => 'delete_toc',
        'desc' => '{StructuredHTMLPlugin.delete_toc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'toc_header',
        'desc' => '{StructuredHTMLPlugin.toc_header}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => '',
    },
];
88
89
# Plugin description record that new() pushes onto the "OptList".
# 'args' refers to the same array as $arguments, so specs appended to
# $arguments later are visible through this record as well.
my $options = {
    'name'           => 'WordPlugin',
    'desc'           => '{WordPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
96
# Appends to @$target every argument spec from @$extra whose 'name' is not
# already present in @$target.  Used by new() so that constructing the plugin
# more than once does not keep growing the file-scoped argument list.
sub _append_missing_args {
    my ($target, $extra) = @_;

    my %seen = map { $_->{'name'} => 1 } @$target;
    foreach my $spec (@$extra) {
        next if $seen{$spec->{'name'}}++;
        push(@$target, $spec);
    }
}

# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the base-class constructor
#   $hashArgOptLists - hash ref; this plugin's argument specs are appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by AutoloadConverterScripting->new, blessed into
# $class.  When the base object has 'info_only' set, it is returned straight
# away without any conversion settings or secondary plugins being configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # GSDLOS may be absent from the environment; treat that as "not windows".
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i);

    my $office_capable = 0;
    if ($on_windows) {
        _append_missing_args($arguments, $opt_windows_args);
        $office_capable = 1;
    }
    if ($AutoloadConverterScripting::openoffice_ext_working) {
        $office_capable = 1;
    }
    # these office args apply to windows scripting or to openoffice scripting
    if ($office_capable) {
        _append_missing_args($arguments, $opt_office_args);
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new rather than indirect-object "new Class(...)", which
    # is ambiguous inside a package that defines its own new().
    my $self = AutoloadConverterScripting->new("OpenOfficeConverter", $pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "doc";
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_scripting'}) {
        if ($self->{'windows_scripting'}) {
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_scripting\n";
            print $outhandle "         on at the same time.  Defaulting to -windows_scripting\n";
            $self->{'openoffice_scripting'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        push(@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-delete_toc") if $self->{'delete_toc'};

        # Options that carry a value are forwarded only when set to a true
        # value on this plugin.  The order is significant: a user-supplied
        # -metadata_fields is appended after the default pushed above.
        foreach my $opt_name (qw(toc_header title_header
                                 level1_header level2_header level3_header
                                 metadata_fields metadata_field_separator)) {
            push(@$specific_options, "-$opt_name", $self->{$opt_name}) if $self->{$opt_name};
        }
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return bless $self, $class;
}
208
# Returns the default process_exp regular expression (as a string) selecting
# the file extensions this plugin handles.  The docx and odt extensions are
# included only when $AutoloadConverterScripting::openoffice_ext_working is
# true.  Any argument passed in is ignored.
sub get_default_process_exp {
    my ($self) = @_;

    return $AutoloadConverterScripting::openoffice_ext_working
        ? q^(?i)\.(doc|dot|docx|odt)$^
        : q^(?i)\.(doc|dot)$^;
}
216
# Legacy post-processing step for a converted file.
#
# Determines the language and encoding of $conv_filename through
# textcat_get_language_encoding() and reads the file into a local buffer with
# read_file().  The buffer is not written back: the clean-up and rewrite
# steps below are commented out.
#
# Parameters:
#   $conv_filename - path of the converted file to read
#
# Returns whatever read_file() returns.
sub convert_post_process_old
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";

    # turn any high bytes that aren't valid utf-8 into utf-8.
    #unicode::ensure_utf8(\$text);

    # Write it out again!
    #$self->utf8_write_file (\$text, $conv_filename);

    return $self->read_file ($conv_filename, $encoding, $language, \$text);
}
236
# Modified to cache HTML files for efficiency reasons rather
# than delete all.  HTML is modified not to use IE's VML.
# VML uses WMZ files, so these can be deleted.
#
# Removes every entry whose name ends in ".wmz" from the directory named by
# $self->{'files_dir'}.  Does nothing when 'files_dir' is not defined or the
# directory cannot be opened.
sub cleanup_tmp_area {
    my ($self) = @_;
    if (defined $self->{'files_dir'}) {
        my $html_files_dir = $self->{'files_dir'};

        # Lexical directory handle: a bareword handle is a package global
        # that any other code using the same name could clobber.
        if (opendir(my $dir_handle, $html_files_dir)) {
            my @wmz_files = grep( /\.wmz$/, readdir($dir_handle));
            foreach my $f (@wmz_files) {
                my $full_f = &util::filename_cat($html_files_dir, $f);
                &util::rm($full_f);
            }
            closedir($dir_handle);
        }
        else {
            # if HTML file has no supporting images, then no _files dir made
            # => do nothing
        }
    }
}
259
260
2611;
262
Note: See TracBrowser for help on using the browser.