source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 3350

Last change on this file since 3350 was 3350, checked in by sjboddie, 22 years ago

Added -use_strings option to ConvertToPlug. The default behaviour for
plugins derived from ConvertToPlug (WordPlug, PDFPlug etc) is now to
exclude documents that can't be converted correctly. They won't use the
perl strings stuff to extract text unless the -use_strings option is
specified.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29# It facilitates the conversion of these document types to either HTML
30# or TEXT by setting up variables that instruct ConvertToBasPlug
31# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Establish the inheritance chain at compile time: this package inherits
# from HTMLPlug. (TEXTPlug behaviour is reached through explicit calls
# elsewhere in this file rather than through @ISA.)
BEGIN {
    our @ISA = qw(HTMLPlug);
}
50
# Print the usage summary for the ConvertToPlug-specific options to STDERR.
#
# $plugin_name is either a plain plugin name or, when this function is
# called directly by pluginfo.pl, a plugin object; in the latter case the
# object's class name is reported.
#
# Returns the result of the final print.
sub print_usage {
    my ($plugin_name) = @_;

    # an object was handed in: report its class name instead
    $plugin_name = ref($plugin_name) if ref($plugin_name);

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
        " -use_strings if set a simple strings function\n",
        " will be called to extract text\n",
        " if the conversion utility fails\n",
    );

    # one print per line, exactly as many writes as there are lines
    my $printed;
    foreach my $usage_line (@usage_lines) {
        $printed = print STDERR $usage_line;
    }
    return $printed;
}
67
# Parse the plugin options that ConvertToPlug itself understands.
#
# Arguments:
#   $class - plugin class name (a trailing ".pm" is stripped to form the
#            plugin name used in messages)
#   $args  - reference to the plugin's argument list, handed to
#            parsargv::parse
#
# Returns the list ($plugin_name, $newargs), where $newargs is a hash
# reference with the keys 'kea', 'kea_options', 'generate_format' and
# 'use_strings'.
#
# Dies (after printing usage to STDERR) if parsargv::parse reports failure.
sub parse_args {
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my $newargs = {};

    # "allow_extra_options" is passed so that options meant for the parent
    # plugin are tolerated here -- presumably they are parsed again by the
    # parent class; confirm against parsargv.
    my $parsed_ok = parsargv::parse(
        $args,
        q^extract_keyphrases^,            \$newargs->{'kea'},
        q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'},
        q^convert_to/(html|text)/html^,   \$newargs->{'generate_format'},
        q^use_strings^,                   \$newargs->{'use_strings'},
        "allow_extra_options");

    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $newargs);
}
93
# Constructor.
#
# Builds the object with either the TEXTPlug or the HTMLPlug constructor,
# depending on the parsed 'convert_to' option, then reblesses it into the
# calling class. When invoked with "ConvertToPlug" as the class, the next
# argument is taken as the real class name.
#
# Every key returned by parse_args is copied onto the object, so
# 'generate_format', 'use_strings', 'kea' and 'kea_options' are available
# as object fields afterwards.
sub new {
    my $class = shift(@_);
    $class = shift(@_) if $class eq "ConvertToPlug";

    # parsargv::parse might modify the list it is given, so keep an
    # untouched copy to hand on to the parent constructor.
    my @arglist = @_;
    my ($plugin_name, $args) = $class->parse_args(\@_);

    # PDF to text is not available on Windows: quietly fall back to HTML.
    if (   $class eq "PDFPlug"
        && $args->{'generate_format'} eq "text"
        && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $args->{'generate_format'} = "html";
    }

    my $self;
    if ($args->{'generate_format'} eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'}     = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'}     = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # copy the parsed options onto the object in one go
    @{$self}{keys %$args} = values %$args;

    return bless $self, $class;
}
131
132
133
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory
# ($GSDLCOLLECTDIR/tmp) so that we don't accidentally damage the input:
# the input file is soft-linked into that directory, the converter is run
# on the link, and the link is removed again afterwards.
#
# Arguments:
#   $output_ext     - desired output type. NOTE: accepted for interface
#                     compatibility but not consulted here; the type that
#                     is actually requested is lc($self->{'convert_to'}).
#   $input_filename - path of the file to convert
#   $textref        - not used by this function
#
# Returns the name of the converted file inside the tmp directory, or the
# empty string if the conversion failed.
#
# Side effects:
#   - on success sets $self->{'convert_to_ext'} to the type reported by the
#     converter, and $self->{'converted_to'} to "HTML" or "TEXT" when that
#     type looks like html or text (it is possible we requested html but
#     only conversion to text succeeded)
#   - on failure increments $self->{'num_not_processed'}, reports to the
#     out and fail handles, echoes the converter's error log to the out
#     handle and removes the log
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle  = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = util::filename_cat($tmp_dirname, "$tailname$suffix");

    util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the command goes through the shell with the filenames
    # only wrapped in double quotes, so a path containing '"', '$' or a
    # backtick will be mis-parsed. Left as is because the quoting rules
    # differ between the platforms this runs on.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    util::rm($tmp_filename);

    # The converter reports the resulting type on its standard output.
    # No output at all means the command could not be run (or died before
    # reporting), so treat that the same as an explicit "fail" instead of
    # silently dropping the document.
    $output_type = "" unless defined $output_type;
    chomp $output_type;
    if ($output_type eq "fail" || $output_type eq "") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;

        # echo whatever the converter wrote to its error log
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $log_line = <$errlog_fh>) {
                    print $outhandle $log_line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not read conversion error log $errlog: $!\n";
            }
        }
        util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }

    # Swap the original extension for the reported type. The suffix is
    # quoted so that regex metacharacters in an extension (".c++", ".(1)")
    # are matched literally.
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
221
222
# Empty the collection specific tmp directory ($GSDLCOLLECTDIR/tmp):
# the directory and all its contents are removed, then the directory
# itself is created again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $tmp_area = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    util::rm_r($tmp_area);
    return util::mk_dir($tmp_area);
}
233
234
235
236
# Override BasPlug read.
#
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML, so the source file is converted first
# and language/encoding detection is then run on the converted file.
#
# Returns:
#   undef - the file does not match process_exp or is not a plain file,
#           or the plugin's process() returned undef
#   0     - the file matched block_exp, could not be converted, or the
#           converted file contained no text
#   1     - the document was built and handed to $processor
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    # only prepend the base directory when there is one worth prepending
    my $filename = ($base_dir =~ /\w/)
        ? util::filename_cat($base_dir, $file)
        : $file;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    unless ($filename =~ /$self->{'process_exp'}/ && -f $filename) {
        return undef;
    }

    # $file often begins with / so we'll tidy it up
    $file =~ s/^[\/\\]+//;

    # convert the source file; an empty or missing result means the
    # conversion failed, and returning 0 lets the import carry on
    my $conv_filename
        = $self->tmp_area_convert_file($self->{'convert_to_ext'}, $filename);
    return 0 if "$conv_filename" eq "" || !-e "$conv_filename";
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    unless (length($text)) {
        my $plugin_name = ref($self);
        if ($self->{'verbosity'}) {
            print $outhandle "$plugin_name: ERROR: $file contains no text\n";
        }
        return 0;
    }

    # convert &eacute; and etc to utf-8 (note: applied whatever the
    # converted type is). This should really happen before language
    # extraction, but that means modifying a file on disk...
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document
    my $doc_obj = 'doc'->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

    # "Source" metadata is the last path component of the original file
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source",
                                ghtml::dmsafe($filemeta));

    $self->associate_cover_image($doc_obj, $filename) if $self->{'cover_image'};

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process(\$text, $pluginfo, $base_dir, $file,
                                   $metadata, $doc_obj);
    return undef unless defined($processed);

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document, then throw away the conversion leftovers
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'}++;

    return 1;
}
314
315
# Do the plugin specific processing of doc_obj for the type the file was
# actually converted to.
#
# The converted file (recorded in $self->{'conv_filename'}) is processed
# with TEXTPlug::process when $self->{'converted_to'} is "TEXT" and with
# HTMLPlug::process otherwise. The original source file is then attached
# to the document as "doc.$doc_ext", together with srclink, srcicon and
# /srclink metadata pointing at it.
#
# Arguments:
#   $doc_ext - extension used for the associated original and its icon
#   the remaining arguments are those of process(), passed through
#
# Returns whatever the chosen process function returned.
sub process_type {
    my $self = shift(@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # the processors are pointed at the converted file in the tmp area
    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname   = File::Basename::dirname($conv_filename);
    my $tmp_tailname  = File::Basename::basename($conv_filename);

    # Called as plain functions with $self passed explicitly, not as
    # methods, so the named class's implementation is always the one used.
    my $type_processor = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;
    my $ret_val = $type_processor->($self, $textref, $pluginfo,
                                    $tmp_dirname, $tmp_tailname,
                                    $metadata, $doc_obj);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename   = util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);

    my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata($cursection, "srclink", $doclink);
    $doc_obj->add_utf8_metadata($cursection, "srcicon", "_icon" . $doc_ext . "_");
    $doc_obj->add_utf8_metadata($cursection, "/srclink", "</a>");

    return $ret_val;
}
353
3541;
Note: See TracBrowser for help on using the repository browser.