source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 9465

Last change on this file since 9465 was 9465, checked in by kjdon, 19 years ago

ConvertToPlug now accepts the gli arg in read() and passes it in process(), and all the convert plugins now print a processing message for gli in their process() method.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29# It facilitates the conversion of these document types to either HTML
30# or TEXT by setting up variables that instruct ConvertToPlug
31# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Set the parent class at compile time.  Only HTMLPlug is listed here;
# new() below constructs either a TEXTPlug or an HTMLPlug object and
# reblesses it into the calling class.
BEGIN {
    @ISA = qw(HTMLPlug);
    # Alternatives that were tried and left disabled:
    #   @ISA = ('HTMLPlug', 'TEXTPlug');
    #   @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
}
50
# Values accepted by the "convert_to" argument.
my $convert_to_list = [
    {
        'name' => 'html',
        'desc' => '{ConvertToPlug.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertToPlug.convert_to.text}',
    },
];

# Arguments this plugin adds: the target format and the use_strings flag.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertToPlug.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'use_strings',
        'desc' => '{ConvertToPlug.use_strings}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option record that new() appends to the object's option_list.
my $options = {
    'name'     => 'ConvertToPlug',
    'desc'     => '{ConvertToPlug.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
74
# Parse the plugin arguments understood by ConvertToPlug.
#
# Called as a class method: $class->parse_args(\@args).
#   $args - reference to the plugin argument list; it is handed to
#           parsargv::parse together with "allow_extra_options".
#
# Returns ($plugin_name, $newargs): the class name with any trailing
# ".pm" removed, and a hash reference whose keys are 'kea',
# 'kea_options', 'generate_format' (filled from convert_to) and
# 'use_strings'.
#
# Dies with "\n" after reporting the problem on STDERR when the
# arguments cannot be parsed.
sub parse_args
{
    my $class = shift (@_);
    my ($args) = @_;

    my $plugin_name = $class;
    $plugin_name =~ s/\.pm$//;

    my $newargs = {};

    if (!parsargv::parse($args,
                         q^extract_keyphrases^, \$newargs->{'kea'}, # undocumented
                         q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, # undocumented
                         q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
                         q^use_strings^, \$newargs->{'use_strings'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";

        # No object exists yet at this point (there is no $self in this
        # sub), so the usage text is requested through the class.  Any
        # failure while printing usage is trapped so that the die below
        # is always the error the caller sees.
        {
            local $@;
            eval { $class->print_txt_usage(""); }; # Use default resource bundle
        }
        die "\n";
    }

    return ($plugin_name, $newargs);
}
100
# Constructor.
#
# Parses the plugin arguments, builds either a TEXTPlug or an HTMLPlug
# object depending on the requested convert_to format, records the target
# format on the object, appends this plugin's option record to the
# object's option list, copies the parsed arguments onto the object and
# reblesses it into $class.
sub new {
    my $class = shift (@_);

    # When invoked on ConvertToPlug itself with a further argument, that
    # argument is taken as the class to construct.
    $class = shift (@_) if ($class eq "ConvertToPlug" && defined $_[0]);

    # parsargv::parse might modify the list it is given, so keep an
    # untouched copy for the parent constructor.
    my @parent_args = @_;
    my ($plugin_name, $args) = $class->parse_args(\@_);

    if ($class eq "PDFPlug" && $args->{'generate_format'} eq "text" &&
        $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $args->{'generate_format'} = "html";
    }

    my $self;
    my $want_text = ($args->{'generate_format'} eq "text");
    if ($want_text) {
        $self = TEXTPlug->new($class, @parent_args);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @parent_args);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push (@{$option_list}, $options);

    # Parsed arguments become fields of the object.
    $self->{$_} = $args->{$_} for (keys %$args);

    return bless $self, $class;
}
142
# Nothing needs to be blocked, so the expression inherited from HTMLPlug
# is overridden with an empty one: files are converted in a temp dir and
# the extra files are not passed down the plugin list.
#
# Returns the empty string.
sub get_default_block_exp {
    my ($self) = @_;

    return "";
}
151
# Delegate directly to BasPlug::store_block_files, passing every argument
# through unchanged, so that the special case implemented by HTMLPlug is
# bypassed.
sub store_block_files {
    return BasPlug::store_block_files(@_);
}
156
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory
# (GSDLCOLLECTDIR/tmp) so that we don't accidentally damage the input:
# the input is soft-linked there under a name with white space removed,
# and the link is deleted once the converter has run.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; it is not
#                     consulted in this sub.  The requested output type is
#                     taken from $self->{'convert_to'}.
#   $input_filename - path of the file to convert.
#   $textref        - accepted but not used in this sub.
#
# Side effects: sets $self->{'convert_to_ext'} to the type reported by the
# converter and $self->{'converted_to'} to "HTML" or "TEXT" when that type
# looks like html or text.  On failure, increments
# $self->{'num_not_processed'} and copies the converter's error log to the
# output handle.
#
# Returns the name of the converted file (the tmp file name with its
# suffix replaced by the reported output type), or "" when the converter
# reports "fail".
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the command line is built by string interpolation and
    # run through the shell; a file name containing a double quote or other
    # shell metacharacters would be interpreted by the shell.  Left as is
    # because convert_options is a free-form option string.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    # The converter prints the type it actually produced (or "fail").
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # Lexical handle and explicit read mode; a failed open is
            # reported instead of being silently ignored.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close $errlog_fh;
            }
            else {
                print $outhandle "Could not read $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded

    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    my $output_filename = $tmp_filename;

    # The suffix is literal text, not a pattern: quote it so that suffixes
    # containing regex metacharacters (e.g. ".c++") are replaced correctly.
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
248
249
# Empty the collection specific tmp directory (GSDLCOLLECTDIR/tmp): the
# directory is removed together with all its contents and then created
# again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    &util::mk_dir($collection_tmp);
}
260
261
262
263
# Override BasPlug read.
# Language and encoding are only worked out after the file has been
# converted to either TEXT or HTML, so detection runs on the converted
# file rather than on the original.
#
# Return values:
#   undef - the file does not match process_exp or is not a plain file
#   0     - the file was blocked, could not be converted, or the
#           converted file contained no text
#   -1    - the plugin specific process() call returned undef
#   1     - the document was built and handed to $processor
sub read {
    my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
#    if ($self->is_recursive()) {
#        die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
#    }

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Blocked either by association (a form of smart block) or by the
    # block expression.
    my $blocked = $self->associate_with($file,$filename,$metadata)
        || ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/);
    if ($blocked) {
        $self->{'num_blocked'} ++;
        return 0;
    }

    # Not ours to process.
    return undef unless ($filename =~ /$self->{'process_exp'}/ && -f $filename);

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Convert the file in the tmp area.
    my $wanted_ext = $self->{'convert_to_ext'};
    my $converted_path = $self->tmp_area_convert_file($wanted_ext, $filename);

    # Returning 0 here allows the build to continue on conversion errors.
    return 0 if ("$converted_path" eq "");
    return 0 if (! -e "$converted_path");
    $self->{'conv_filename'} = $converted_path;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($converted_path);

    # read in file ($content will be in utf8)
    my $content = "";
    &BasPlug::read_file($self, $converted_path, $encoding, $language, \$content);
    unless (length ($content)) {
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # if we converted to HTML, convert &eacute; and etc to utf-8.
    # this should really happen before language_extraction, but that means
    # modifying a file on disk...
    $content =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document, named after the original file rather than
    # the converted one
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_converted_filename($converted_path);
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

    # Source is the last path component of $file.
    my ($source_name) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($source_name));

    $self->associate_cover_image($doc_obj, $filename) if ($self->{'cover_image'});

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$content, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    return -1 if (!defined $processed);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
352
353
# Plugin specific processing of doc_obj for a converted document.
#
# The converted text is handed to TEXTPlug::process when the conversion
# produced TEXT and to HTMLPlug::process otherwise, using the directory
# and file name of the converted file.  Afterwards the original file is
# associated with the document as "doc.<doc_ext>", and the FileFormat,
# srclink, srcicon and /srclink metadata are set on the top section.
#
# Parameters:
#   $doc_ext - extension of the original document (e.g. "doc", "pdf")
#   the remaining parameters are passed on to the TEXT/HTML process call
#
# Returns whatever the TEXTPlug/HTMLPlug process call returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname = File::Basename::dirname($conv_filename);
    my $tmp_tailname = File::Basename::basename($conv_filename);

    my $converted_to = $self->{'converted_to'};

    my $ret_val = ($converted_to eq "TEXT")
        ? &TEXTPlug::process($self, $textref, $pluginfo,
                             $tmp_dirname, $tmp_tailname,
                             $metadata, $doc_obj)
        : &HTMLPlug::process($self, $textref, $pluginfo,
                             $tmp_dirname, $tmp_tailname,
                             $metadata, $doc_obj);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);

    # Human readable format name for each known extension.
    my %format_for = (
        "doc" => "Word",
        "xls" => "Excel",
        "ppt" => "PPT",
        "pdf" => "PDF",
        "rtf" => "RTF",
        "ps"  => "PS",
    );
    my $file_format = $format_for{$doc_ext} || "unknown";

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_format);

    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");

    return $ret_val;
}
413
4141;
Note: See TracBrowser for help on using the repository browser.