source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 3767

Last change on this file since 3767 was 3720, checked in by sjboddie, 21 years ago

Added options to PDFPlug to take advantage of the improvements in
version 0.34 of pdftohtml. It now works much better for non latin
input documents (producing UTF-8 encoded HTML).

  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by plugins such as WordPlug and PDFPlug.
# It facilitates the conversion of these document types to either HTML
# or TEXT by setting up variables that instruct ConvertToBasPlug
# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Fix the static parent class at compile time. Only HTMLPlug is listed;
# alternatives that were tried here in the past and left disabled were
# ('HTMLPlug', 'TEXTPlug') and ('BasPlug').
BEGIN {
    @ISA = qw(HTMLPlug);
}
50
# The values accepted by the -convert_to option.
my $convert_to_list = [
    { 'name' => "html", 'desc' => "" },
    { 'name' => "text", 'desc' => "" },
];

# Descriptions of the options this plugin declares.
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "Plugin converts to TEXT or HTML (default html).",
        'type' => "enum",
        'reqd' => "no",
        'list' => $convert_to_list,
        'deft' => "html",
    },
];

# Plugin description record; new() appends it to the object's option list.
my $options = {
    'name'     => "ConvertToPlug",
    'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. If the argument is not present, the default is to inherit HTMLPlug.",
    'inherits' => "Yes",
    'args'     => $arguments,
};
72
73
# Print the usage message for this plugin to STDERR.
#
# $plugin_name is normally the plugin's name; when this function is called
# directly by pluginfo.pl it is a plugin object, and the object's class
# name is used instead. The return value is not meaningful.
sub print_usage {
    my ($plugin_name) = @_;

    $plugin_name = ref($plugin_name) if ref($plugin_name);

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
        " -use_strings if set a simple strings function\n",
        " will be called to extract text\n",
        " if the conversion utility fails\n",
    );

    # one print per line of the message, exactly as many as there are lines
    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
}
90
# Parse the plugin's argument list.
#
# Called as a class method with a reference to the argument list. Returns
# a two element list: the plugin name (the class name with any trailing
# ".pm" removed) and a hash reference holding the parsed values under the
# keys 'kea', 'kea_options', 'generate_format' and 'use_strings'.
#
# Options this plugin does not know about are tolerated
# ("allow_extra_options"). If parsing fails the usage message is printed
# and the program dies.
sub parse_args {
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my %parsed;
    my $parsed_ok = parsargv::parse(
        $args,
        q^extract_keyphrases^,            \$parsed{'kea'},              # plain flag
        q^extract_keyphrase_options/.*/^, \$parsed{'kea_options'},      # takes a value
        q^convert_to/(html|text)/html^,   \$parsed{'generate_format'},
        q^use_strings^,                   \$parsed{'use_strings'},
        "allow_extra_options"
    );

    if (!$parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, \%parsed);
}
116
# Constructor.
#
# Builds the object with TEXTPlug's constructor when the 'convert_to'
# option is "text" and with HTMLPlug's constructor otherwise, records the
# chosen target format on the object, appends this plugin's option
# description to the object's option list, copies the parsed arguments
# onto the object and blesses it into the calling class.
sub new {
    my $class = shift(@_);

    # when invoked on ConvertToPlug itself, the class to bless into is
    # passed as the next argument
    $class = shift(@_) if $class eq "ConvertToPlug";

    # parsargv::parse might modify the list it is handed, so the parent
    # constructor is given a copy taken beforehand
    my @arglist = @_;
    my (undef, $args) = $class->parse_args(\@_);

    my $format = $args->{'generate_format'};
    if ($format eq "text" && $class eq "PDFPlug"
        && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $format = "html";
        $args->{'generate_format'} = $format;
    }

    my $self;
    if ($format eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'}     = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'}     = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    # copy every parsed argument onto the object
    @{$self}{ keys %$args } = values %$args;

    return bless $self, $class;
}
158
159
160
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input: the input file is soft
# linked into that directory and it is the link that gets converted.
#
# Arguments:
#   $output_ext     - accepted for interface compatibility but not
#                     consulted here; the type requested from the converter
#                     is the lower-cased $self->{'convert_to'}
#   $input_filename - path of the file to convert
#   $textref        - accepted but unused
#
# Returns the name of the converted file inside the tmp directory, or ""
# if the converter reported "fail". In the failure case the contents of
# the error log are copied to the output handle, a line is written to the
# fail handle and $self->{'num_not_processed'} is incremented.
#
# On success $self->{'convert_to_ext'} is set to the type the converter
# actually produced, and $self->{'converted_to'} to "HTML" or "TEXT" when
# that type looks like html or text.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    # NOTE(review): the file names are only wrapped in double quotes, so a
    # name containing a double quote or other shell metacharacters will
    # break (or alter) this command line. Left as is because the command is
    # deliberately run through the shell.
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    # the converter prints the type it actually produced (or "fail")
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s "$errlog") {
            # lexical handle and a named line variable, so that neither a
            # global filehandle nor the caller's $_ is disturbed
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $errlog_line = <$errlog_fh>) {
                    print $outhandle $errlog_line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open error log $errlog: $!\n";
            }
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded

    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }
    my $output_filename = $tmp_filename;

    # the suffix is literal text, not a pattern: quote it so that characters
    # such as "." or "+" in the extension are matched as themselves
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
252
253
# Empty the collection specific tmp directory: it is removed together with
# all of its contents and then created again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $scratch_dir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    util::rm_r($scratch_dir);
    util::mk_dir($scratch_dir);
}
264
265
266
267
# Override BasPlug read.
# Language and encoding detection must not happen until the file has been
# converted to either TEXT or HTML, so the conversion is done first and
# the detection is run on the converted file.
#
# Returns:
#   undef - the file does not match 'process_exp' or is not a plain file,
#           or the plugin specific process() returned undef
#   0     - the file matched 'block_exp', could not be converted, or the
#           converted file contained no text
#   1     - the document was built and handed to the processor
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat ($base_dir, $file)
        : $file;

    # files matching the block expression are counted and claimed
    my $block_exp = $self->{'block_exp'};
    if ($block_exp ne "" && $filename =~ /$block_exp/) {
        $self->{'num_blocked'} ++;
        return 0;
    }

    # anything this plugin does not handle is left for other plugins
    my $process_exp = $self->{'process_exp'};
    return undef if ($filename !~ /$process_exp/ || !-f $filename);

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # convert the file; a failed conversion is not fatal, which allows the
    # import to continue on errors
    my $conv_filename
        = $self->tmp_area_convert_file($self->{'convert_to_ext'}, $filename);
    return 0 if ("$conv_filename" eq "" || ! -e "$conv_filename");
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding)
        = $self->textcat_get_language_encoding ($conv_filename);

    # read in the converted file ($text will be in utf8)
    my $text = "";
    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        my $plugin_name = ref ($self);
        if ($self->{'verbosity'}) {
            print $outhandle "$plugin_name: ERROR: $file contains no text\n";
        }
        return 0;
    }

    # if we converted to HTML, convert &eacute; and etc to utf-8.
    # this should really happen before language_extraction, but that means
    # modifying a file on disk...
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document and give it its basic metadata
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

    # Source is the last path component of $file
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));

    $self->associate_cover_image($doc_obj, $filename) if ($self->{'cover_image'});

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    return undef unless defined ($processed);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document, then clear out the conversion tmp area
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
345
346
# Do the plugin specific processing of doc_obj for a converted document.
#
# The converted file (recorded in $self->{'conv_filename'}) is processed
# with TEXTPlug::process when the conversion produced TEXT and with
# HTMLPlug::process otherwise. The original source file is then associated
# with the document as "doc.<doc_ext>", and the srclink, srcicon and
# /srclink metadata that point at it are added to the top section.
#
# Returns whatever the TEXTPlug/HTMLPlug process function returned.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # the processing is done relative to where the converted file lives
    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname   = File::Basename::dirname($conv_filename);
    my $tmp_tailname  = File::Basename::basename($conv_filename);

    # pick the processing function that matches the conversion result
    my $process_sub = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $process_sub->($self, $textref, $pluginfo,
                                 $tmp_dirname, $tmp_tailname,
                                 $metadata, $doc_obj);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);
    my $assoc_name  = "doc.$doc_ext";
    $doc_obj->associate_file($source_path, $assoc_name, undef, $top_section);

    my $open_link = "<a href=\"_httpcollection_/index/assoc/[archivedir]/$assoc_name\">";
    $doc_obj->add_utf8_metadata ($top_section, "srclink", $open_link);
    $doc_obj->add_utf8_metadata ($top_section, "srcicon", "_icon${doc_ext}_");
    $doc_obj->add_utf8_metadata ($top_section, "/srclink", "</a>");

    return $ret_val;
}
384
3851;
Note: See TracBrowser for help on using the repository browser.