source: trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlug.pm -- plugin that inherits from RogPlug
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlug;
29
30use BasPlug;
31use RogPlug;
32
33
34sub BEGIN {
35 @ISA = ('RogPlug');
36}
37
38my $options = { 'name' => "ConvertToRogPlug",
39 'desc' => "{ConvertToRogPlug.desc}",
40 'abstract' => "yes",
41 'inherits' => "yes" };
42
43
44sub parse_args
45{
46 my $class = shift (@_);
47 my ($args) = @_;
48
49 my $plugin_name = $class;
50 $plugin_name =~ s/\.pm$//;
51
52 if (!parsargv::parse($args,
53 "allow_extra_options")) {
54
55 print STDERR "\nIncorrect options passed to $plugin_name, ";
56 print STDERR "check your collect.cfg configuration file\n";
57 $self->print_txt_usage(""); # Use default resource bundle
58 die "\n";
59 }
60
61 return ($plugin_name);
62}
63
64sub new {
65 my $class = shift (@_);
66 if ($class eq "ConvertToRogPlug" && defined $_[0]) {$class = shift (@_);}
67 my $self;
68 # parsargv::parse might modify the list, so we do this by creating a copy
69 # of the argument list.
70 my @arglist = @_;
71 my ($plugin_name) = $class->parse_args(\@_);
72
73 $self = new RogPlug ($class, @arglist);
74 $self->{'convert_to'} = "Rog";
75 $self->{'convert_to_ext'} = "rog";
76 $self->{'plugin_type'} = "ConvertToRogPlug";
77 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
78 my $option_list = $self->{'option_list'};
79 push( @{$option_list}, $options );
80
81 return bless $self, $class;
82}
83
84
85sub begin {
86 my $self = shift (@_);
87
88 $self->SUPER::begin(@_);
89
90 $self->{'docnum'} = 0;
91}
92
93sub end {
94 my ($self) = @_;
95
96 # nothing to do, but keep symmetric with begin function
97 $self->SUPER::end(@_);
98}
99
100
101# Run conversion utility on the input file.
102#
103# The conversion takes place in a collection specific 'tmp' directory so
104# that we don't accidentally damage the input.
105#
106# The desired output type is indicated by $output_ext. This is usually
107# something like "html" or "word", but can be "best" (or the empty string)
108# to indicate that the conversion utility should do the best it can.
109
110sub tmp_area_convert_file {
111 my $self = shift (@_);
112 my ($output_ext, $input_filename, $textref) = @_;
113
114 my $outhandle = $self->{'outhandle'};
115 my $convert_to = $self->{'convert_to'};
116 my $failhandle = $self->{'failhandle'};
117
118 # softlink to collection tmp dir
119 my $tmp_dirname
120 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
121 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
122
123 # derive tmp filename from input filename
124 my ($tailname, $dirname, $suffix)
125 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
126
127 # Remove any white space from filename -- no risk of name collision, and
128 # makes later conversion by utils simpler. Leave spaces in path...
129 $tailname =~ s/\s+//g;
130
131 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
132
133 &util::soft_link($input_filename, $tmp_filename);
134
135 my $verbosity = $self->{'verbosity'};
136 if ($verbosity > 0) {
137 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
138 }
139
140 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
141
142 # Execute the conversion command and get the type of the result,
143 # making sure the converter gives us the appropriate output type
144 my $output_type = lc($convert_to);
145 my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
146 $output_type = `$cmd`;
147
148 # remove symbolic link to original file
149 &util::rm($tmp_filename);
150
151 # Check STDERR here
152 chomp $output_type;
153 if ($output_type eq "fail") {
154 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
155 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
156 $self->{'num_not_processed'} ++;
157 if (-s "$errlog") {
158 open(ERRLOG, "$errlog");
159 while (<ERRLOG>) {
160 print $outhandle "$_";
161 }
162 print $outhandle "\n";
163 close ERRLOG;
164 }
165 &util::rm("$errlog") if (-e "$errlog");
166 return "";
167 }
168
169 # store the *actual* output type and return the output filename
170 # it's possible we requested conversion to html, but only to text succeeded
171
172 $self->{'convert_to_ext'} = $output_type;
173 $self->{'converted_to'} = "Rog";
174
175 my $output_filename = $tmp_filename;
176
177 $output_filename =~ s/$suffix$//;
178
179 return $output_filename;
180}
181
182
183# Remove collection specific tmp directory and all its contents.
184
185sub cleanup_tmp_area {
186 my $self = shift (@_);
187
188 my $tmp_dirname
189 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
190 &util::rm_r($tmp_dirname);
191 &util::mk_dir($tmp_dirname);
192}
193
194
195# Exact copy of read_rog_record from RogPlug
196# Needed for FILE in right scope
197
198sub read_rog_record
199{
200 my ($self,$file_buffer, $docnum, $seclevel) = @_;
201
202 my $next_line = $file_buffer->{'next_line'};
203
204 return 0 if (!defined $next_line);
205
206 if ($next_line eq "")
207 {
208 my $line;
209 while(defined($line=<FILE>))
210 {
211 $line =~ s/\r$//;
212 $file_buffer->{'line_no'}++;
213 next if ($line =~ m/^\#/);
214 $next_line = $line;
215 last;
216 }
217 }
218
219 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
220 {
221 print STDERR "Error: Malformed Rog file: $next_line";
222 return 0;
223 }
224 else
225 {
226 # init default values
227 $file_buffer->{'song'}->{'tempo'} = 120;
228 $file_buffer->{'song'}->{'ks_type'} = 0;
229 $file_buffer->{'song'}->{'ks_num'} = 0;
230 $file_buffer->{'song'}->{'metadata'} = [];
231 $file_buffer->{'song'}->{'content'} = "";
232
233 $file_buffer->{'song'}->{'subcol'} = $2;
234 $file_buffer->{'song'}->{'title'} = $4;
235 $file_buffer->{'song'}->{'tval'} = $6;
236
237 chomp($next_line);
238 my $content = $next_line;
239 if (defined $docnum)
240 {
241 $content.= " $docnum $seclevel";
242 }
243 $content .= "\n";
244
245 $file_buffer->{'song'}->{'content'} = $content;
246
247
248 my $line;
249 while(defined($line=<FILE>))
250 {
251 $line =~ s/\r$//;
252
253 $file_buffer->{'line_no'}++;
254 next if ($line =~ m/^\#/);
255
256 if ($line =~ m/^song/)
257 {
258 $file_buffer->{'next_line'} = $line;
259 return 1;
260 }
261 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
262 {
263 $file_buffer->{'song'}->{'tempo'} = $2;
264 $file_buffer->{'song'}->{'content'} .= $line;
265 }
266 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
267 {
268 $file_buffer->{'song'}->{'ks_type'} = $2;
269 $file_buffer->{'song'}->{'ks_num'} = $4;
270 $file_buffer->{'song'}->{'content'} .= $line;
271 }
272 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
273 {
274 $file_buffer->{'song'}->{'ts_numer'} = $2;
275 $file_buffer->{'song'}->{'ts_denom'} = $4;
276 $file_buffer->{'song'}->{'content'} .= $line;
277 }
278 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
279 {
280 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
281 $file_buffer->{'song'}->{'content'} .= $line;
282 }
283 else
284 {
285 $file_buffer->{'song'}->{'content'} .= $line;
286 }
287 }
288
289 $file_buffer->{'next_line'} = undef;
290 }
291
292 return 1;
293}
294
295# Override RogPlug function so rog files are stored as sections (not docs)
296
297sub process_rog_record
298{
299 my ($self,$doc_obj,$cursection,$song) = @_;
300
301 $cursection =
302 $doc_obj->insert_section($cursection);
303 $self->{'docnum'}++;
304
305 my $title = $song->{'title'};
306 my $title_safe = $title;
307 $title_safe =~ s/\'/\\\\&apos;/g;
308
309 # add metadata
310 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
311 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
312 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
313 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
314 $doc_obj->add_metadata($cursection, "Title", $title);
315 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
316 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
317
318 foreach $md ( @{$song->{'metadata'}} )
319 {
320 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
321 }
322
323 # add contents as text
324 $doc_obj->add_text($cursection,$song->{'content'});
325
326 return $cursection;
327}
328
329
330
331# Override BasPlug read
332# We don't want to get language encoding stuff until after we've converted
333# our file to Rog format
334sub read {
335 my $self = shift (@_);
336 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
337# if ($self->is_recursive()) {
338# die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
339# }
340
341 my $outhandle = $self->{'outhandle'};
342
343 my $filename = $file;
344 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
345
346 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
347 $self->{'num_blocked'} ++;
348 return 0;
349 }
350 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
351 return undef;
352 }
353 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
354
355 # read in file ($text will be in utf8)
356 my $text = "";
357
358 my $output_ext = $self->{'convert_to_ext'};
359 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
360
361 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
362 $self->{'conv_filename'} = $conv_filename;
363
364
365 # create a new document
366 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
367 # the original filename is used now
368 my $doc_obj = new doc ($filename, "indexed_doc");
369 # the converted filename is set separately
370 $doc_obj->set_converted_filename($conv_filename);
371
372 my $topsection = $doc_obj->get_top_section();
373 my $cursection = $topsection;
374
375 $self->{'docnum'}++;
376 my $docnum = $self->{'docnum'};
377
378 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
379 my ($filemeta) = $file =~ /([^\\\/]+)$/;
380 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
381 if ($self->{'cover_image'}) {
382 $self->associate_cover_image($doc_obj, $filename);
383 }
384 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
385 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
386
387 my $track_no = "1";
388 my $rog_filename = "$conv_filename$track_no.$output_ext";
389 while (1)
390 {
391 last unless open (FILE, $rog_filename) ;
392
393 my $file_buffer = { line_no => 0, next_line => "", song => {} };
394
395 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
396 {
397 my $song = $file_buffer->{'song'};
398 my $content = $song->{'content'};
399 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
400
401 $cursection
402 = $self->process_rog_record($doc_obj,$cursection,
403 $file_buffer->{'song'});
404 }
405
406 close FILE;
407
408 $track_no++;
409 $rog_filename = "$conv_filename$track_no.$output_ext";
410 }
411
412 print STDERR "\n";
413
414 # include any metadata passed in from previous plugins
415 # note that this metadata is associated with the top level section
416 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
417 # do plugin specific processing of doc_obj
418 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
419 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
420 return -1;
421 }
422 # do any automatic metadata extraction
423 $self->auto_extract_metadata ($doc_obj);
424 # add an OID
425 $doc_obj->set_OID();
426
427 my $oid = $doc_obj->get_OID();
428 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
429
430 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
431 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
432 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
433
434 # if no title metadata defined, set it to filename minus extension
435 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
436 if (!defined $existing_title)
437 {
438 my $title = $doc_obj->get_metadata_element($topsection,"Source");
439 $title =~ s/\..*?$//g;
440 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
441
442 my $title_safe = $title;
443 $title_safe =~ s/\'/\\\\&apos;/g;
444 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
445 }
446
447 # process the document
448 $processor->process($doc_obj);
449 $self->cleanup_tmp_area();
450
451 $self->{'num_processed'} ++;
452
453 return 1;
454}
455
456
457# do plugin specific processing of doc_obj for HTML type
458sub process_type {
459 my $self = shift (@_);
460 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
461
462 my $conv_filename = $self->{'conv_filename'};
463 my $tmp_dirname = File::Basename::dirname($conv_filename);
464 my $tmp_tailname = File::Basename::basename($conv_filename);
465
466 my $converted_to = $self->{'converted_to'};
467 my $ret_val = 1;
468
469# $ret_val = &RogPlug::process($self, $textref, $pluginfo,
470# $tmp_dirname, $tmp_tailname,
471# $metadata, $doc_obj);
472
473 # associate original file with doc object
474 my $cursection = $doc_obj->get_top_section();
475 my $filename = &util::filename_cat($base_dir, $file);
476 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
477
478 my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
479 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
480 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
481 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
482 return $ret_val;
483}
484
4851;
Note: See TracBrowser for help on using the repository browser.