source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 4744

Last change on this file since 4744 was 4744, checked in by mdewsnip, 21 years ago

Tidied up the $arguments and $options structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
RevLine 
[1410]1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29# It facilitates the conversion of these document types to either HTML
30# or TEXT by setting up variables that instruct ConvertToBasPlug
31# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
[1446]40use BasPlug;
[1410]41use HTMLPlug;
42use TEXTPlug;
[2751]43use ghtml;
[1410]44
# Fix the static parent class at compile time. Only HTMLPlug is listed, so
# method lookup on an object blessed into this class goes through HTMLPlug.
# Earlier variants, kept for reference:
#   @ISA = ('HTMLPlug', 'TEXTPlug');
#   @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
BEGIN {
    @ISA = qw(HTMLPlug);
}
50
# Values accepted by the convert_to argument.
my $convert_to_list = [
    { 'name' => "html", 'desc' => "HTML format" },
    { 'name' => "text", 'desc' => "Plain text format" },
];

# Descriptions of the arguments this plugin understands.
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "Plugin converts to TEXT or HTML.",
        'type' => "enum",
        'reqd' => "no",
        'list' => $convert_to_list,
        'deft' => "html",
    },
    {
        'name' => "use_strings",
        'desc' => "If set, a simple strings function will be called to "
                . "extract text if the conversion utility fails.",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Top-level description of the plugin; new() appends this structure to the
# object's option_list.
my $options = {
    'name'     => "ConvertToPlug",
    'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. "
                . "It facilitates the conversion of these document types to either HTML "
                . "or TEXT by setting up variable that instruct ConvertToBasPlug how to "
                . "work. It works by dynamically inheriting HTMLPlug or TEXTPlug based "
                . "on the plugin argument 'convert_to'. If the argument is not present, "
                . "the default is to inherit HTMLPlug.",
    'inherits' => "Yes",
    'args'     => $arguments,
};
[3540]73
74
# Print the usage summary for this plugin's options to STDERR.
#
# $plugin_name is either the plugin's name as a string or an object (as
# when pluginfo.pl calls this directly); for an object, its class name is
# used in the message.
sub print_usage {
    my ($plugin_name) = @_;

    # An object was handed in: report its class name instead.
    $plugin_name = ref($plugin_name) if ref($plugin_name);

    my $usage_text = join('',
        "\n usage: plugin " . $plugin_name . " [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
        " -use_strings if set a simple strings function\n",
        " will be called to extract text\n",
        " if the conversion utility fails\n",
    );

    print STDERR $usage_text;
}
91
# Parse the plugin arguments this class cares about.
#
# Called as a class method with a reference to the argument list.
# Returns ($plugin_name, $newargs): the class name with any trailing ".pm"
# removed, and a hash reference holding the parsed values under the keys
# 'kea', 'kea_options', 'generate_format' and 'use_strings'.
#
# If parsargv::parse reports failure, a message and the usage summary are
# written to STDERR and the call dies with "\n".
sub parse_args
{
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my $newargs = {};

    # Spec string / destination pairs handed to parsargv::parse.
    # NOTE(review): the specs look like "name/pattern/default", which would
    # make extract_keyphrases a plain switch and extract_keyphrase_options
    # a free-form value -- confirm against parsargv.pm. Both are
    # undocumented in the usage text.
    my @option_specs = (
        'extract_keyphrases',            \$newargs->{'kea'},
        'extract_keyphrase_options/.*/', \$newargs->{'kea_options'},
        'convert_to/(html|text)/html',   \$newargs->{'generate_format'},
        'use_strings',                   \$newargs->{'use_strings'},
        "allow_extra_options",
    );

    my $parsed_ok = parsargv::parse($args, @option_specs);
    return ($plugin_name, $newargs) if $parsed_ok;

    print STDERR "\nIncorrect options passed to $plugin_name, ";
    print STDERR "check your collect.cfg configuration file\n";
    print_usage($plugin_name);
    die "\n";
}
117
# Constructor.
#
# Parses the plugin arguments (via parse_args) to find the requested
# conversion target, builds the underlying object with TEXTPlug's or
# HTMLPlug's constructor accordingly, records the target in 'convert_to' /
# 'convert_to_ext', appends this plugin's $options to the object's
# 'option_list', copies every parsed argument onto the object, and finally
# blesses the object into $class.
#
# Arguments: the class name followed by the plugin argument list.
# Returns:   the plugin object, blessed into $class.
sub new {
    my $class = shift (@_);
    my $self;

    # parsargv::parse might modify the list, so parse_args works on @_
    # while the parent constructor gets an untouched copy.
    my @arglist = @_;
    my ($plugin_name, $args) = $class->parse_args(\@_);

    # Guard against unset values so the comparisons below never operate
    # on undef. An unset format is treated as "html", which is the branch
    # an undefined value has always ended up in.
    my $format = defined $args->{'generate_format'}
        ? $args->{'generate_format'} : "html";
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";

    if ($class eq "PDFPlug" && $format eq "text" && $gsdlos =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $format = "html";
    }
    $args->{'generate_format'} = $format;

    if ($format eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    # Push through the hash slot itself: if the parent constructor did not
    # create 'option_list', the list is created on the object rather than
    # on a temporary copy that would be thrown away.
    push @{ $self->{'option_list'} }, $options;

    foreach my $key (keys %$args) {
        $self->{$key} = $args->{$key};
    }

    return bless $self, $class;
}
160
161
[1435]162
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory
# ($GSDLCOLLECTDIR/tmp), on a link to the input, so that we don't
# accidentally damage the input.
#
# Arguments:
#   $output_ext     - accepted for interface compatibility; the output type
#                     requested from the converter is actually derived from
#                     $self->{'convert_to'}
#   $input_filename - the file to convert
#   $textref        - accepted but not used here
#
# Returns the tmp filename with its suffix replaced by the output type the
# converter reported, or "" if the converter reported "fail".
#
# On success, $self->{'convert_to_ext'} is set to the reported type and
# $self->{'converted_to'} to "HTML" or "TEXT" when the type matches one of
# those (it's possible we requested html but only text succeeded).
# On failure, messages go to the out and fail handles, the contents of the
# error log are copied to the out handle, and 'num_not_processed' is bumped.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    # (the directory part returned by fileparse is not needed)
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the filename to make later conversion
    # by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the command is assembled as a string and run through
    # the shell; the paths are only wrapped in double quotes, so a filename
    # containing a double quote, '$' or a backtick would not be passed
    # through safely.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    $output_type = `$cmd`;
    # Backticks yield undef when the command produced no output at all;
    # treat that as an empty type rather than operating on undef.
    $output_type = "" unless defined $output_type;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # Copy the converter's error log to the out handle. A lexical
            # handle and three-argument open keep the filename from being
            # read as an open mode, and a failed open is reported.
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle "$line";
                }
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
            print $outhandle "\n";
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded

    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    my $output_filename = $tmp_filename;

    # The suffix is literal text, not a pattern: quote it so characters
    # such as '.' or '+' in it cannot change what is matched.
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
254
255
# Reset the collection specific tmp directory ($GSDLCOLLECTDIR/tmp): the
# directory is handed to util::rm_r and then created again with
# util::mk_dir, leaving an empty tmp area behind.

sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    &util::mk_dir($collection_tmp);
}
266
267
268
[1420]269
# Override BasPlug read.
# The file is first converted (to TEXT or HTML) in the tmp area; language
# and encoding are only determined afterwards, from the converted file.
#
# Returns:
#   0     - the file matched block_exp, the conversion produced nothing
#           usable, or the converted file contained no text
#   undef - the file does not match process_exp, is not a plain file, or
#           the plugin's process() returned undef
#   1     - the document was built and handed to $processor
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    # Full path of the source file; $base_dir is only prepended when it
    # contains at least one word character.
    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat ($base_dir, $file)
        : $file;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    unless ($filename =~ /$self->{'process_exp'}/ && -f $filename) {
        return undef;
    }

    # $file often begins with / so we'll tidy it up
    $file =~ s{^[/\\]+}{};

    # Convert the file in the tmp area.
    my $conv_filename
        = $self->tmp_area_convert_file($self->{'convert_to_ext'}, $filename);

    # allows continue on errors
    return 0 if $conv_filename eq "" || !-e $conv_filename;
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding)
        = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    unless (length ($text)) {
        my $plugin_name = ref ($self);
        if ($self->{'verbosity'}) {
            print $outhandle "$plugin_name: ERROR: $file contains no text\n";
        }
        return 0;
    }

    # Convert &eacute; and etc to utf-8. This is applied whatever the
    # conversion target was; it should really happen before language
    # extraction, but that means modifying a file on disk...
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});

    # Source is the last path component of the tidied $file.
    my ($filemeta) = $file =~ m{([^\\/]+)$};
    my @top_level_metadata = (
        [ "Language", $language ],
        [ "Encoding", $encoding ],
        [ "Source",   &ghtml::dmsafe($filemeta) ],
    );
    foreach my $pair (@top_level_metadata) {
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),
                                    $pair->[0], $pair->[1]);
    }

    $self->associate_cover_image($doc_obj, $filename)
        if $self->{'cover_image'};

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$text, $pluginfo, $base_dir, $file,
                                    $metadata, $doc_obj);
    return undef unless defined ($processed);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document, then empty the tmp area
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
347
348
# Do the type specific processing of doc_obj for a converted document.
#
# The converted file recorded in $self->{'conv_filename'} is processed by
# TEXTPlug::process when $self->{'converted_to'} is "TEXT" and by
# HTMLPlug::process otherwise. The original file is then associated with
# the document as "doc.$doc_ext", and srclink / srcicon / /srclink
# metadata pointing at it are added to the top section.
#
# Returns whatever the chosen process routine returned.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # The type specific routine works on the converted file, so it is
    # given that file's directory and name rather than $base_dir / $file.
    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname = File::Basename::dirname($conv_filename);
    my $tmp_tailname = File::Basename::basename($conv_filename);

    my $type_handler = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $type_handler->($self, $textref, $pluginfo,
                                  $tmp_dirname, $tmp_tailname,
                                  $metadata, $doc_obj);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    my $assoc_name = "doc." . $doc_ext;
    $doc_obj->associate_file($filename, $assoc_name, undef, $cursection);

    # link and icon metadata for the associated original
    my $doclink = '<a href="_httpcollection_/index/assoc/[archivedir]/'
                . $assoc_name . '">';
    $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon" . $doc_ext . "_");
    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");

    return $ret_val;
}
386
3871;
Note: See TracBrowser for help on using the repository browser.