source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 3038

Last change on this file since 3038 was 3038, checked in by jrm21, 22 years ago

Put \" \" around href for srclink, in case the collection name has spaces
in it.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
4# on plugin argument convert_to
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The plugin is inherited by such plugins as WordPlug and PDFPlug.
29# It facilitates the conversion of these document types to either HTML
30# or TEXT by setting up variables that instruct ConvertToBasPlug
31# how to work.
32
33# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
34# the plugin argument 'convert_to'. If the argument is not present,
35# the default is to inherit HTMLPlug.
36
37
38package ConvertToPlug;
39
40use BasPlug;
41use HTMLPlug;
42use TEXTPlug;
43use ghtml;
44
# Establish the static parent class at compile time.  HTMLPlug is the
# default parent; new() in this file builds the object from either HTMLPlug
# or TEXTPlug and then re-blesses it into the requesting class.
BEGIN {
    @ISA = ('HTMLPlug');
}
50
# Print a short usage summary for the plugin to STDERR.
#
# $plugin_name is either the plugin's name as a string or a plugin object
# (as happens when pluginfo.pl calls this function directly), in which case
# the object's class name is reported instead.
sub print_usage {
    my ($plugin_name) = @_;

    # An object was handed in: use its class name in the message.
    my $class_of_arg = ref($plugin_name);
    $plugin_name = $class_of_arg if $class_of_arg;

    my @usage_lines = (
        "\n usage: plugin $plugin_name [options]\n\n",
        " options:\n",
        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
        " (default html)\n",
    );

    # One print per line, returning the status of the last print.
    my $last_status;
    $last_status = print STDERR $_ foreach @usage_lines;
    return $last_status;
}
64
# Parse the plugin options this class understands.
#
# Called as a class method: $class->parse_args(\@options).
#
# Recognised options (handed to parsargv::parse):
#   -extract_keyphrases          stored in $kea_arg->{'kea'}
#   -extract_keyphrase_options   stored in $kea_arg->{'kea_options'}
#   -convert_to (html|text)      stored in $generate_format (default html)
# "allow_extra_options" is passed through as well, presumably so that
# options meant for the parent plugins are not rejected here -- confirm
# against parsargv.
#
# Returns the list ($plugin_name, $generate_format, $kea_arg), where
# $plugin_name is $class with any trailing ".pm" removed.  Prints a
# diagnostic plus the usage text and dies if parsing fails.
sub parse_args {
    my ($class, $args) = @_;

    (my $plugin_name = $class) =~ s/\.pm$//;

    my ($generate_format, $kea_arg);

    my $parsed_ok = parsargv::parse(
        $args,
        q^extract_keyphrases^,            \$kea_arg->{'kea'},
        q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'},
        q^convert_to/(html|text)/html^,   \$generate_format,
        "allow_extra_options"
    );

    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        &print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $generate_format, $kea_arg);
}
90
# Constructor.
#
# May be invoked either with the real plugin class as the invocant, or with
# "ConvertToPlug" as the invocant and the real class as the first argument;
# in the latter case the real class is taken from the argument list.
#
# The object is built by TEXTPlug or HTMLPlug, depending on the -convert_to
# option, and then re-blessed into the requesting class.  Fields set here:
#   convert_to          "TEXT" or "HTML"
#   convert_to_ext      "txt" or "html"
#   rename_assoc_files  1 (HTML only)
#   metadata_fields     ",GENERATOR" appended (HTML only)
#   kea / kea_options   1 when the corresponding option was supplied
sub new {
    my $class = shift @_;
    $class = shift @_ if $class eq "ConvertToPlug";

    # parsargv::parse might modify the list it is given, so keep an
    # untouched copy to hand on to the parent constructor.
    my @arglist = @_;
    my (undef, $generate_format, $kea_arg) = $class->parse_args(\@_);

    if (   $class eq "PDFPlug"
        && $generate_format eq "text"
        && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $generate_format = "html";
    }

    my $wants_text = ($generate_format eq "text");

    my $self = $wants_text
        ? TEXTPlug->new($class, @arglist)
        : HTMLPlug->new($class, @arglist);

    $self->{'convert_to'}     = $wants_text ? "TEXT" : "HTML";
    $self->{'convert_to_ext'} = $wants_text ? "txt"  : "html";

    unless ($wants_text) {
        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # Record whether keyphrase (kea) extraction was requested.
    foreach my $kea_key ('kea', 'kea_options') {
        $self->{$kea_key} = 1 if $kea_arg->{$kea_key};
    }

    return bless $self, $class;
}
128
129
130
# Run the conversion utility (gsConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory
# ($GSDLCOLLECTDIR/tmp) so that we don't accidentally damage the input.
#
# Arguments:
#   $output_ext      accepted for interface compatibility; the requested
#                    output type is taken from $self->{'convert_to'}
#   $input_filename  path of the file to convert
#   $textref         accepted for interface compatibility; unused here
#
# Returns the name of the converted file in the tmp directory, or "" if the
# converter reported "fail" (in which case the failure is logged to the
# out/fail handles and num_not_processed is incremented).
#
# Side effects: sets $self->{'convert_to_ext'} to the output type actually
# reported by the converter and, when that type looks like html or text,
# sets $self->{'converted_to'} to "HTML" or "TEXT".
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle  = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, $dirname, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from the file name (not from the path); this
    # makes later conversion by the utilities simpler.
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the file names are interpolated into a shell command
    # inside double quotes; a name containing '"', '$' or '`' would be
    # interpreted by the shell.  Left unchanged here because list-form
    # pipes are not available on every platform/perl this may run on.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;

        # Copy the converter's error log, if any, to the output handle.
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $log_line = <$errlog_fh>) {
                    print $outhandle $log_line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not read conversion error log $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    } elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }

    # Swap the original suffix for the output type.  The suffix is quoted
    # so that its dot (and any other regex metacharacter in an unusual
    # extension) is matched literally rather than as a pattern.
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}
214
215
# Empty the collection specific tmp directory ($GSDLCOLLECTDIR/tmp): the
# directory is removed together with all its contents and then recreated.
sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    return &util::mk_dir($collection_tmp);
}
226
227
228
229
# Override BasPlug read.
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs)
#
# Returns:
#   undef  the file does not match process_exp or is not a plain file, or
#          the plugin specific process() returned undef
#   0      the file is blocked by block_exp, the conversion failed, or the
#          converted file contained no text
#   1      one document was built and handed to $processor
#
# Once the conversion has been attempted, the collection tmp area is
# cleaned up on every exit path (not only on success), so that files left
# behind by a failed document cannot interfere with the next one.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    # Conversion failed or produced nothing: tidy up and carry on with the
    # next file rather than aborting.
    if ("$conv_filename" eq "" || ! -e "$conv_filename") {
        $self->cleanup_tmp_area();
        return 0;
    }
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    if (!length ($text)) {
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        $self->cleanup_tmp_area();
        return 0;
    }

    # create a new document
    my $doc_obj = doc->new ($conv_filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

    # "Source" metadata is the file name without any leading directories
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($top_section, "Source", &ghtml::dmsafe($filemeta));
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $process_result
        = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    if (!defined ($process_result)) {
        $self->cleanup_tmp_area();
        return undef;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();
    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
302
303
# Do the plugin specific processing of doc_obj for a converted document.
#
# The converted text is handed to TEXTPlug::process when the conversion
# produced TEXT ($self->{'converted_to'} eq "TEXT") and to HTMLPlug::process
# otherwise, using the directory and file name of the converted file
# ($self->{'conv_filename'}) in place of the original base_dir and file.
#
# Afterwards the original source file is associated with the document as
# "doc.$doc_ext", and srclink, srcicon and /srclink metadata are added to
# the top section so that the original can be linked to.
#
# Returns whatever the delegated process call returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname   = File::Basename::dirname($conv_filename);
    my $tmp_tailname  = File::Basename::basename($conv_filename);

    # Choose the parent implementation that matches the actual output type.
    my $delegate = ($self->{'converted_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $delegate->($self, $textref, $pluginfo,
                              $tmp_dirname, $tmp_tailname,
                              $metadata, $doc_obj);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_path = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_path, "doc.$doc_ext", undef, $top_section);

    # The href is quoted in case the collection name has spaces in it.
    my $assoc_href = "_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext";
    $doc_obj->add_utf8_metadata($top_section, "srclink", "<a href=\"$assoc_href\">");
    $doc_obj->add_utf8_metadata($top_section, "srcicon", "_icon${doc_ext}_");
    $doc_obj->add_utf8_metadata($top_section, "/srclink", "</a>");

    return $ret_val;
}
341
3421;
Note: See TracBrowser for help on using the repository browser.