source: main/trunk/greenstone2/perllib/plugins/WordPlugin.pm

Last change on this file was 38739, checked in by davidb, 2 months ago

Accidentally committed debug statements. Now removed

  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1###########################################################################
2#
3# WordPlugin.pm -- plugin for importing Microsoft Word documents
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package WordPlugin;
26
27use strict;
28no strict 'refs'; # allow filehandles to be variables and viceversa
29no strict 'subs';
30
31use gsprintf 'gsprintf';
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
# Set up inheritance at compile time. Method lookup searches
# ConvertBinaryFile first, then AutoLoadConverters.
BEGIN {
    @WordPlugin::ISA = qw(ConvertBinaryFile AutoLoadConverters);
}
39
# Set to 1 by new() when the AutoLoadConverters object reports that
# OpenOffice is available; read by get_default_process_exp().
my $openoffice_available = 0;

# Arguments the plugin always accepts. new() appends the optional groups
# declared below to this same array before registering it, and $options
# (further down) holds a reference to it.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        # delayed: new() replaces this marker with the real default regex
        'deft' => "&get_default_process_exp()",
        'reqd' => "no",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
];

# Added by new() only when GSDLOS is windows.
my $opt_windows_args = [
    {
        'name' => "windows_scripting",
        'desc' => "{WordPlugin.windows_scripting}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Added by new() when either Windows scripting or OpenOffice conversion
# is possible.
my $opt_office_args = [
    {
        'name' => "metadata_fields",
        'desc' => "{WordPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title",
    },
    {
        'name' => "level1_header",
        'desc' => "{StructuredHTMLPlugin.level1_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level2_header",
        'desc' => "{StructuredHTMLPlugin.level2_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "level3_header",
        'desc' => "{StructuredHTMLPlugin.level3_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "title_header",
        'desc' => "{StructuredHTMLPlugin.title_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
    {
        'name' => "delete_toc",
        'desc' => "{StructuredHTMLPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "toc_header",
        'desc' => "{StructuredHTMLPlugin.toc_header}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
];

# Added by new() only when OpenOffice is available.
my $opt_soffice_args = [
    {
        'name' => "generate_pdf_as_associated_file",
        'desc' => "{WordPlugin.generate_pdf_as_associated_file}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Plugin summary record that new() pushes onto the shared OptList.
my $options = {
    'name'           => "WordPlugin",
    'desc'           => "{WordPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in Word can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
107
# Append to @$arg_list every entry of @$extra_args whose 'name' is not
# already present, so calling new() repeatedly never registers the same
# option definition twice.
sub _add_missing_args {
    my ($arg_list, $extra_args) = @_;

    my %known = map { $_->{'name'} => 1 } @$arg_list;
    foreach my $arg (@$extra_args) {
        next if $known{$arg->{'name'}};
        push(@$arg_list, $arg);
        $known{$arg->{'name'}} = 1;
    }
}

# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays receive
#                      this plugin's option summary and argument definitions
#
# Returns the merged AutoLoadConverters/ConvertBinaryFile object blessed into
# $class. When 'info_only' is set the object is returned before any
# conversion settings or secondary plugins are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The ArgList push needs to happen later, after the arguments array has
    # been finished - it is used for parsing the input args.
    # The OptList entry needs to go in first, to get the print info in the
    # right order.
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # $arguments is file-scoped and is also referenced from $options, so it
    # has to be extended in place; _add_missing_args keeps that idempotent.
    my $office_capable = 0;
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        _add_missing_args($arguments, $opt_windows_args);
        $office_capable = 1;
    }

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["OpenOfficeConverter"],1);

    if ($auto_converter_self->{'openoffice_available'}) {
        $office_capable = 1;
        $openoffice_available = 1;
    }

    # these office args apply to windows scripting or to openoffice conversion
    if ($office_capable) {
        _add_missing_args($arguments, $opt_office_args);
    }

    # Both OpenOffice and LibreOffice now have much better support for using
    # the provided 'soffice' CLI to directly convert Word doc/docx files to
    # other formats, such as PDF
    if ($openoffice_available) {
        _add_missing_args($arguments, $opt_soffice_args);
    }

    # Resolve the default for process_exp. It is delayed till here so we know
    # whether openoffice is available, but it must be done before the args
    # are parsed. Once resolved, the value no longer starts with '&', so
    # later calls leave it alone.
    foreach my $arg (@$arguments) {
        next unless $arg->{'name'} eq "process_exp";
        if ($arg->{'deft'} =~ m/^&/) {
            $arg->{'deft'} = get_default_process_exp();
        }
        last;
    }

    # have finished modifying our arguments, add them to ArgList
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});

    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "Word";

    my $outhandle = $self->{'outhandle'};

    if ($self->{'windows_scripting'}) {
        $self->{'convert_options'} = "-windows_scripting";
        $self->{'office_scripting'} = 1;
    }
    if ($self->{'openoffice_conversion'}) {
        if ($self->{'windows_scripting'}) {
            # the two modes are mutually exclusive; windows_scripting wins
            print $outhandle "Warning: Cannot have -windows_scripting and -openoffice_conversion\n";
            print $outhandle " on at the same time. Defaulting to -windows_scripting\n";
            $self->{'openoffice_conversion'} = 0;
        }
        else {
            $self->{'office_scripting'} = 1;
        }
    }

    # check convert_to
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "html";
    }
    # windows or open office scripting, outputs structuredHTML
    if (defined $self->{'office_scripting'}) {
        $self->{'convert_to'} = "structuredhtml";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
        push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
        push (@$specific_options, "-extract_language") if $self->{'extract_language'};
        push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
        push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
        push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
        push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
        push (@$specific_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'};
        push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
        # a user-supplied metadata_fields value is passed as a second
        # -metadata_fields option, after the fixed list above
        push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
        push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
        push(@$specific_options, "-processing_tmp_files");
    }
    elsif ($secondary_plugin_name eq "HTMLPlugin") {
        push(@$specific_options, "-processing_tmp_files");
        push(@$specific_options,"-input_encoding", "utf8");
        push(@$specific_options,"-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-description_tags") if $self->{'description_tags'};
        # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}
249
# Return the default process_exp (a regular expression, as a string) that
# decides which file extensions this plugin claims. Any arguments passed
# are ignored.
#
#   - OpenOffice available: doc, dot, docx, odt, wpd
#   - otherwise on Windows: doc, docx, dot
#   - anywhere else:        doc, dot
sub get_default_process_exp {
    return '(?i)\.(doc|dot|docx|odt|wpd)$' if $openoffice_available;

    # if OS is windows, can try using docx2html vbs script to see if they
    # have Word 2007, if the user turns windows_scripting on
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        return '(?i)\.(docx?|dot)$';
    }

    return '(?i)\.(doc|dot)$';
}
262
# Run both parents' init with the same arguments: first the one found via
# SUPER (ConvertBinaryFile comes first in @ISA), then AutoLoadConverters'
# explicitly. The value of the AutoLoadConverters call is what is returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
271
# Run both parents' begin with the same arguments: AutoLoadConverters'
# first, then the one found via SUPER. The value of the SUPER call is
# what is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
279
# Run both parents' deinit with the same arguments: AutoLoadConverters'
# first, then the one found via SUPER. The value of the SUPER call is
# what is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
287
# Always use the AutoLoadConverters implementation of
# tmp_area_convert_file, bypassing normal @ISA lookup order. All arguments
# are forwarded untouched and its result is returned as-is.
sub tmp_area_convert_file {
    my $self = shift;

    my @forwarded = ();
    return $self->AutoLoadConverters::tmp_area_convert_file(@_, @forwarded);
}
293
294
# Older post-conversion step. Detects the language and encoding of the
# converted file and reads it into a local buffer. The steps that would
# force the text to valid utf-8 (unicode::ensure_utf8) and write it back
# (utf8_write_file) are disabled, so the file on disk is not changed here.
#
# Parameters:
#   $conv_filename - path of the converted file
#
# Returns whatever read_file returns.
sub convert_post_process_old
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file (the original note says $text will be in utf8)
    my $text = "";
    return $self->read_file($conv_filename, $encoding, $language, \$text);
}
314
# Modified to cache HTML files for efficiency reasons rather than delete
# everything. Per the original note, the HTML is modified not to use IE's
# VML, so the .wmz files in the supporting files directory (presumably
# what the original note called "WML" files) can be deleted.
#
# Reads $self->{'files_dir'}. If it is undefined, or the directory cannot
# be opened, nothing is done. Otherwise every entry ending in ".wmz" is
# removed via FileUtils::removeFiles.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $html_files_dir = $self->{'files_dir'};
    return unless defined $html_files_dir;

    # if HTML file has no supporting images, then no _files dir made
    # => do nothing
    opendir(my $dir_handle, $html_files_dir) or return;

    # Collect the names and close the handle before deleting anything, so
    # the directory is not held open while its contents change.
    my @wmz_files = grep { /\.wmz$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $wmz_file (@wmz_files) {
        my $full_path = &FileUtils::filenameConcatenate($html_files_dir, $wmz_file);
        &FileUtils::removeFiles($full_path);
    }
}
337
338
# do plugin specific processing of doc_obj for doc_ext type
#
# Runs the inherited process_type first. Then, if the
# 'generate_pdf_as_associated_file' option is set, uses the 'soffice'
# command line to produce a PDF version of the source document and attaches
# it to the top section of $doc_obj as "doc.pdf".
#
# Parameters:
#   $base_dir - directory that $file is relative to
#   $file     - the source Word document
#   $doc_obj  - document object that receives the associated PDF
#
# Returns the value returned by the inherited process_type, whether or not
# the PDF generation succeeds.
sub process_type {
    my $self = shift (@_);
    my ($base_dir, $file, $doc_obj) = @_;

    my $pt_return_val = $self->SUPER::process_type($base_dir,$file,$doc_obj);

    # Check if 'generate_pdf_as_associated_file' is set ...
    if ($self->{'generate_pdf_as_associated_file'}) {
        # => If it is, then set about generating a PDF version
        #    of the file, based on the doc/docx file and
        #    add in the generated PDF as an associated file

        # basename() is called fully qualified below, and nothing in this
        # file loads the module, so load it here rather than relying on
        # another module having done so.
        require File::Basename;

        my $source_file_path = &FileUtils::filenameConcatenate($base_dir, $file);

        ####
        # The following is loosely based on ImageConverter::convert()
        #
        # A sign that more refactoring could be done in this
        # area of the code??
        ####

        my $source_file_no_path = &File::Basename::basename($source_file_path);

        # Determine the full path name of the output file, 'target_file_path'
        #
        # Note: given how 'soffice' works with --outdir, we also need to work out
        # 'target_file_dir', which is effectively dirname of 'target_file_path'
        #
        my $target_file_path; # the output file to generate (needed for caching checks)
        my $target_file_dir;  # the output directory to use (needed by soffice)

        if ($self->{'enable_cache'}) {
            my $cached_file_dir = $self->{'cached_dir'};
            my $file_root = $self->{'cached_file_root'};
            my $ofile = "$file_root.pdf";

            # NOTE(review): soffice is only given an output directory, so
            # this assumes it will name its output "$file_root.pdf" - TODO
            # confirm cached_file_root matches the source file's root name.
            $target_file_dir = $cached_file_dir;
            $target_file_path = &FileUtils::filenameConcatenate($target_file_dir,$ofile);
        }
        else {
            # same name as the source, with its extension swapped for .pdf
            my $ofile = $source_file_no_path;
            $ofile =~ s/\.\w+$/.pdf/;

            $target_file_dir = &util::determine_tmp_dir();
            $target_file_path = &FileUtils::filenameConcatenate($target_file_dir,$ofile);
        }

        # Generate and run the convert command, which has the form:
        #   soffice --headless --convert-to pdf --outdir <dir> <source file>
        #
        # NOTE(review): both paths are interpolated between double quotes.
        # If autorun_general_cmd hands this string to a shell (not visible
        # here), a path containing a double quote or other characters the
        # shell treats specially would alter the command - confirm how file
        # names are constrained upstream.
        my $convert_command = "soffice --headless --convert-to pdf --outdir \"$target_file_dir\" \"$source_file_path\"";

        my $print_info = { 'message_prefix' => "Converting Doc to PDF",
                           'message' => "Converting Word Document $source_file_path to $target_file_dir" };

        my ($regenerated,$result,$had_error)
            = $self->autorun_general_cmd($convert_command,$source_file_path,$target_file_path,$print_info);

        if ($had_error) {
            my $outhandle = $self->{'outhandle'};

            print $outhandle "Warning: failed to convert Word Document to PDF\n";
            print $outhandle "$result\n";
        }
        else {
            my $cursection = $doc_obj->get_top_section();
            $doc_obj->associate_file($target_file_path, "doc.pdf", undef, $cursection);

            if (!$self->{'enable_cache'}) {
                # Safe to now remove the tmp file
                # NOTE(review): this relies on associate_file no longer
                # needing the file on disk once it returns - confirm.
                &FileUtils::removeFiles($target_file_path);
            }
        }
    }

    return $pt_return_val;
}
418
419
4201;
421
Note: See TracBrowser for help on using the repository browser.