source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 4785

Last change on this file since 4785 was 4745, checked in by mdewsnip, 21 years ago

Uncommented a line which shouldn't have been committed commented.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29# It facilitates the conversion of these document types to either HTML
30# or TEXT by setting up variables that instruct ConvertToBasPlug
31# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Fix the static parent class at compile time.  HTMLPlug is the declared
# parent; new() below may still construct the object through TEXTPlug when
# the 'convert_to' argument asks for text.
BEGIN {
    @ISA = qw(HTMLPlug);
    # Alternatives that were tried previously:
    #   @ISA = ('HTMLPlug', 'TEXTPlug');
    #   @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
}
50
# Values accepted by the 'convert_to' argument.
my $convert_to_list = [
    { name => "html", desc => "HTML format" },
    { name => "text", desc => "Plain text format" },
];

# Descriptions of the arguments this plugin adds.
my $arguments = [
    {
        name => "convert_to",
        desc => "Plugin converts to TEXT or HTML.",
        type => "enum",
        reqd => "no",
        list => $convert_to_list,
        deft => "html",
    },
    {
        name => "use_strings",
        desc => "If set, a simple strings function will be called to extract text if the conversion utility fails.",
        type => "flag",
        reqd => "no",
    },
];

# Option record for this plugin; new() appends it to the object's
# 'option_list'.
my $options = {
    name     => "ConvertToPlug",
    desc     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. If the argument is not present, the default is to inherit HTMLPlug.",
    inherits => "Yes",
    args     => $arguments,
};
73
74
# Write a usage summary for the options handled by this plugin to STDERR.
#
# $plugin_name is normally the plugin's name as a string.  When the
# function is called directly by pluginfo.pl it is a plugin object
# instead, and the object's class name is used.
#
# Returns the result of the final print.
sub print_usage {
    my ($plugin_name) = @_;

    # for when this function is called directly by pluginfo.pl
    $plugin_name = ref ($plugin_name) if ref ($plugin_name);

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
        " -use_strings if set a simple strings function\n",
        " will be called to extract text\n",
        " if the conversion utility fails\n",
    );

    my $print_status;
    foreach my $usage_line (@usage_lines) {
        $print_status = print STDERR $usage_line;
    }
    return $print_status;
}
91
# Parse the plugin arguments that ConvertToPlug itself understands.
#
# Called as a class method with a reference to the argument list.  The
# list is handed to parsargv::parse with "allow_extra_options", so options
# belonging to other plugins are not treated as errors here.
#
# Returns a two-element list: the plugin name (the class name with any
# trailing ".pm" removed) and a hash reference holding the parsed values
# under the keys 'kea', 'kea_options', 'generate_format' and 'use_strings'.
#
# Dies, after printing a diagnostic and the usage text to STDERR, when
# parsargv::parse reports failure.
sub parse_args
{
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my $newargs = {};

    my $parsed_ok = parsargv::parse(
        $args,
        # undocumented keyphrase options; presumably a plain switch and a
        # variant carrying extra option text -- confirm against the Kea
        # integration
        q^extract_keyphrases^, \$newargs->{'kea'},
        q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'},
        # requested output format; defaults to html
        q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
        q^use_strings^, \$newargs->{'use_strings'},
        "allow_extra_options");

    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $newargs);
}
117
# Construct a plugin object whose behaviour comes from TEXTPlug or HTMLPlug,
# depending on the 'convert_to' argument (HTML when not given).
#
# May be called either on the subclass directly, or as
# ConvertToPlug->new($subclass, @args), in which case the first argument is
# taken as the real class name.
#
# The object is built by the chosen parent's constructor, annotated with
# 'convert_to' / 'convert_to_ext', given this plugin's option record, has
# the parsed arguments copied onto it, and is finally blessed into $class.
sub new {
    my $class = shift (@_);
    if ($class eq "ConvertToPlug") {$class = shift (@_);}

    # parsargv::parse might modify the list, so keep a pristine copy of the
    # argument list for the parent constructor.
    my @arglist = @_;
    my ($plugin_name, $args) = $class->parse_args(\@_);

    # GSDLOS may be unset; treat that as "not windows" rather than matching
    # against an undefined value.
    my $on_windows = defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i;

    if ($class eq "PDFPlug" && $args->{'generate_format'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $args->{'generate_format'} = "html";
    }

    # Arrow method calls are used instead of "new TEXTPlug (...)": the
    # indirect-object form is ambiguous to the parser, particularly inside a
    # package that defines its own new().
    my $self;
    if ($args->{'generate_format'} eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    # Push through $self so that a missing list is created on the object
    # itself rather than on a detached temporary.
    push( @{ $self->{'option_list'} }, $options );

    # Copy every parsed argument onto the object.
    foreach my $key (keys %$args) {
        $self->{$key} = $args->{$key};
    }

    return bless $self, $class;
}
159
160
161
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input: the input file is soft-linked
# into that directory and the link is converted.
#
# Arguments:
#   $output_ext     - accepted for interface compatibility; not used here.
#                     The output type requested from gsConvert.pl is the
#                     lower-cased $self->{'convert_to'}.
#   $input_filename - path of the file to convert.
#   $textref        - accepted for interface compatibility; not used here.
#
# Returns the name the converted file is expected to have in the tmp
# directory (the link name with its suffix replaced by the output type
# reported by the converter), or "" when the converter reports "fail".
#
# Side effects: sets $self->{'convert_to_ext'} to the reported output type,
# sets $self->{'converted_to'} to "HTML" or "TEXT" when the reported type
# matches one of those, and increments $self->{'num_not_processed'} on
# failure.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the file name, which makes later
    # conversion by utils simpler.  Spaces in the path are left alone.
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the command is run through the shell with the file names
    # only wrapped in double quotes, so a name containing a double quote or
    # other shell metacharacters would break (or alter) the command.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # Three-argument open with a lexical handle; if the log cannot
            # be opened there is simply nothing to copy.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $log_line = <$errlog_fh>) {
                    print $outhandle $log_line;
                }
                close($errlog_fh);
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    my $output_filename = $tmp_filename;

    # \Q...\E: the suffix is literal text, not a pattern.  Unquoted, its
    # leading "." would match any character and a suffix such as ".c++"
    # would not even compile as a regular expression.
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
253
254
# Empty the collection specific tmp directory: delete it together with
# everything inside it, then create it again so it is ready for the next
# conversion.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    &util::mk_dir($collection_tmp);
}
265
266
267
268
# Override BasPlug read
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
# $maxdocs is accepted but not used here.
#
# Returns:
#   undef - the file does not match 'process_exp', is not a plain file, or
#           the plugin's process() returned undef
#   0     - the file matched 'block_exp', could not be converted, or the
#           converted file held no text
#   1     - the document was converted, processed and handed to $processor
#
# The tmp area is cleaned up on every exit path taken after a conversion
# has produced output, so converted files do not pile up there when a
# later step fails.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    # allows continue on errors; on an explicit conversion failure
    # tmp_area_convert_file has already removed its link and error log
    if ("$conv_filename" eq "") {return 0;}
    if (! -e "$conv_filename") {
        # converter reported success but the expected file is missing
        $self->cleanup_tmp_area();
        return 0; # allows continue on errors
    }
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        $self->cleanup_tmp_area();
        return 0;
    }

    # if we converted to HTML, convert &eacute; and etc to utf-8.
    # this should really happen before language_extraction, but that means
    # modifying a file on disk...
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    if (!defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        $self->cleanup_tmp_area();
        return undef;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
346
347
# Plugin specific processing of doc_obj for a converted document.
#
# The converted text is passed to TEXTPlug::process when
# $self->{'converted_to'} is "TEXT" and to HTMLPlug::process otherwise,
# using the directory and file name of $self->{'conv_filename'} in place of
# $base_dir and $file.  Afterwards the original source file is associated
# with the document as "doc.$doc_ext", and 'srclink', 'srcicon' and
# '/srclink' metadata are added to the top section.
#
# Returns whatever the chosen process function returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # locate the converted file inside the tmp area
    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname = File::Basename::dirname($conv_filename);
    my $tmp_tailname = File::Basename::basename($conv_filename);

    # choose the parent implementation that matches the actual output type
    my $process_sub = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $process_sub->($self, $textref, $pluginfo,
                                 $tmp_dirname, $tmp_tailname,
                                 $metadata, $doc_obj);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_file = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_file, "doc.$doc_ext", undef, $top_section);

    # metadata used to link back to (and show an icon for) the source file
    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    my @link_metadata = (
        [ "srclink",  $doclink ],
        [ "srcicon",  "_icon".$doc_ext."_" ],
        [ "/srclink", "</a>" ],
    );
    foreach my $entry (@link_metadata) {
        $doc_obj->add_utf8_metadata ($top_section, $entry->[0], $entry->[1]);
    }

    return $ret_val;
}
385
3861;
Note: See TracBrowser for help on using the repository browser.