source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
# This plugin is inherited by plugins such as WordPlug and PDFPlug.
# It facilitates the conversion of these document types to either HTML
# or TEXT by setting up variables that instruct ConvertToPlug
# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Set the parent class at compile time.  HTMLPlug is the static parent;
# new() below builds the object with either HTMLPlug or TEXTPlug and then
# reblesses it into the calling class.
BEGIN {
    @ISA = ('HTMLPlug');
}
50
# Write a short usage summary for the plugin to STDERR.
#
# $plugin_name is normally the plugin's name.  When pluginfo.pl calls this
# function directly it passes a plugin object instead, in which case the
# object's class name is used.
sub print_usage {
    my $plugin_name = shift;

    # an object was passed: report its class name
    $plugin_name = ref($plugin_name) if ref($plugin_name);

    my $usage = "\n usage: plugin $plugin_name [options]\n\n"
        . " options:\n"
        . " -convert_to (html|text) plugin converts to TEXT or HTML\n"
        . " (default html)\n";

    print STDERR $usage;
}
64
# Parse the options that ConvertToPlug itself understands.
#
# Called as a class method with a reference to the plugin's argument list.
# The option specs handed to parsargv::parse are:
#   extract_keyphrases              -> $kea_arg->{'kea'}
#   extract_keyphrase_options/.*/   -> $kea_arg->{'kea_options'}
#   convert_to/(html|text)/html     -> $generate_format
# followed by "allow_extra_options" (presumably so that options meant for
# the parent plugin are tolerated -- confirm against parsargv).
#
# Returns the list ($plugin_name, $generate_format, $kea_arg), where
# $plugin_name is the class name with any trailing ".pm" removed and
# $kea_arg is a hash reference.  If parsargv::parse returns false, an error
# and the usage text are printed to STDERR and the function dies.
sub parse_args {
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my $generate_format;
    my %kea;

    my $parsed_ok = parsargv::parse(
        $args,
        q^extract_keyphrases^,            \$kea{'kea'},
        q^extract_keyphrase_options/.*/^, \$kea{'kea_options'},
        q^convert_to/(html|text)/html^,   \$generate_format,
        "allow_extra_options");

    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        &print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $generate_format, \%kea);
}
90
# Constructor.
#
# May be invoked either on the subclass itself or as
# new ConvertToPlug ($class, @args); in the second form the real class name
# is the first argument.  Depending on the -convert_to option the object is
# built by TEXTPlug or HTMLPlug, annotated with the conversion settings, and
# reblessed into $class.
sub new {
    my $class = shift;
    $class = shift if $class eq "ConvertToPlug";

    # parsargv::parse might modify the list it is given, so keep a pristine
    # copy of the arguments for the parent constructor.
    my @arglist = @_;
    my ($plugin_name, $generate_format, $kea_arg) = $class->parse_args(\@_);

    # pdf to text is not available on Windows; fall back to HTML
    if ($class eq "PDFPlug" && $generate_format eq "text") {
        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
            print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
            $generate_format = "html";
        }
    }

    my $wants_text = ($generate_format eq "text");

    my $self = $wants_text
        ? TEXTPlug->new($class, @arglist)
        : HTMLPlug->new($class, @arglist);

    $self->{'convert_to'}     = $wants_text ? "TEXT" : "HTML";
    $self->{'convert_to_ext'} = $wants_text ? "txt"  : "html";

    unless ($wants_text) {
        # settings that only apply to HTML output
        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # record whether kea data is to be extracted
    if ($kea_arg->{'kea'}) {
        $self->{'kea'} = 1;
    }
    if ($kea_arg->{'kea_options'}) {
        $self->{'kea_options'} = 1;
    }

    return bless $self, $class;
}
128
129
130
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments:
#   $output_ext      accepted for interface compatibility but not read here;
#                    the requested output type is lc($self->{'convert_to'})
#   $input_filename  path of the file to convert
#   $textref         accepted but not used here
#
# Returns the name of the converted file inside the tmp area, or "" if the
# converter reported "fail".  The output type reported by the converter is
# stored in $self->{'convert_to_ext'}.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the filename, which makes later conversion
    # by the utilities simpler.  Spaces in the path are left alone.
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the file names are interpolated into a shell command
    # line; a name containing a double quote or other shell metacharacters
    # would break (or alter) the command.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;
        if (-s $errlog) {
            # Relay the converter's error log.  A lexical handle with a
            # three-argument open is used so the path cannot be misread as
            # a mode, and a named loop variable avoids clobbering $_.
            if (open(my $errfh, '<', $errlog)) {
                while (my $line = <$errfh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close $errfh;
            }
            else {
                print $outhandle "Could not read $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    $self->{'convert_to_ext'} = $output_type;
    my $output_filename = $tmp_filename;

    # \Q...\E: the suffix is literal text, not a pattern (think ".c++")
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
207
208
# Empty the collection specific tmp directory: the directory is removed
# together with all of its contents and then created again.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collect_tmp = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    util::rm_r($collect_tmp);
    util::mk_dir($collect_tmp);
}
219
220
221
222
# Override BasPlug read.
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML.
#
# Return values, as produced by the code below:
#   0      the file matched block_exp, could not be converted, or the
#          converted file contained no text
#   undef  the file does not match process_exp or is not a plain file, or
#          the plugin's process() returned undef
#   1      the document was built and handed to $processor
#
# $maxdocs is accepted but not used here.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);
    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    # convert the source document inside the collection's tmp area
    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    # NOTE(review): the early returns from here on leave whatever the
    # converter produced in the tmp area; it is only removed by a later
    # call to cleanup_tmp_area().
    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    if (! -e "$conv_filename") {return 0;} # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding ($conv_filename);
    } elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata.  Assign to the outer
        # $language: declaring a new lexical here would leave the outer one
        # undefined and the "Language" metadata below would be empty.
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
        $encoding = $self->{'input_encoding'};
        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }
    } else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($file));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    # do plugin specific processing of doc_obj
    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    # the converted files are no longer needed once the document is processed
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
306
307
# Do the plugin specific processing of doc_obj for the converted document.
#
# The converted text is handed to TEXTPlug::process or HTMLPlug::process,
# depending on $self->{'convert_to'}, using the directory and name of the
# converted file ($self->{'conv_filename'}) in place of the original ones.
# The original source file is then associated with the document as
# "doc.$doc_ext", and srclink / srcicon / /srclink metadata referring to it
# are added to the top section.
#
# Returns whatever the chosen process function returned.
sub process_type {
    my $self = shift;
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # location of the converted file
    my $converted = $self->{'conv_filename'};
    my $conv_dir  = File::Basename::dirname($converted);
    my $conv_name = File::Basename::basename($converted);

    # pick the processing routine that matches the conversion target
    my $process_sub = ($self->{'convert_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $process_sub->($self, $textref, $pluginfo,
                                 $conv_dir, $conv_name,
                                 $metadata, $doc_obj);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_file = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_file, "doc.$doc_ext", undef, $top_section);

    # metadata used to link back to the original document
    my @link_metadata = (
        [ "srclink",  "<a href=_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext>" ],
        [ "srcicon",  "_icon".$doc_ext."_" ],
        [ "/srclink", "</a>" ],
    );
    foreach my $entry (@link_metadata) {
        $doc_obj->add_utf8_metadata ($top_section, $entry->[0], $entry->[1]);
    }

    return $ret_val;
}
345
3461;
Note: See TracBrowser for help on using the repository browser.