source: gsdl/trunk/perllib/plugins/ConvertToRogPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34sub BEGIN {
35 @ConvertToRogPlugin::ISA = ('RogPlugin');
36}
37
38my $arguments = [
39 ];
40my $options = { 'name' => "ConvertToRogPlugin",
41 'desc' => "{ConvertToRogPlugin.desc}",
42 'abstract' => "yes",
43 'inherits' => "yes" };
44
45sub new {
46 my ($class) = shift (@_);
47 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
48 push(@$pluginlist, $class);
49
50 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
51 push(@{$hashArgOptLists->{"OptList"}},$options);
52
53 my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
54
55 $self->{'convert_to'} = "Rog";
56 $self->{'convert_to_ext'} = "rog";
57
58 return bless $self, $class;
59}
60
61
62sub begin {
63 my $self = shift (@_);
64
65 $self->SUPER::begin(@_);
66
67 $self->{'docnum'} = 0;
68}
69
70sub end {
71 my ($self) = @_;
72
73 # nothing to do, but keep symmetric with begin function
74 $self->SUPER::end(@_);
75}
76
77
78# Run conversion utility on the input file.
79#
80# The conversion takes place in a collection specific 'tmp' directory so
81# that we don't accidentally damage the input.
82#
83# The desired output type is indicated by $output_ext. This is usually
84# something like "html" or "word", but can be "best" (or the empty string)
85# to indicate that the conversion utility should do the best it can.
86
87sub tmp_area_convert_file {
88 my $self = shift (@_);
89 my ($output_ext, $input_filename, $textref) = @_;
90
91 my $outhandle = $self->{'outhandle'};
92 my $convert_to = $self->{'convert_to'};
93 my $failhandle = $self->{'failhandle'};
94
95 # softlink to collection tmp dir
96 my $tmp_dirname
97 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
98 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
99
100 # derive tmp filename from input filename
101 my ($tailname, $dirname, $suffix)
102 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
103
104 # Remove any white space from filename -- no risk of name collision, and
105 # makes later conversion by utils simpler. Leave spaces in path...
106 $tailname =~ s/\s+//g;
107
108 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
109
110 &util::soft_link($input_filename, $tmp_filename);
111
112 my $verbosity = $self->{'verbosity'};
113 if ($verbosity > 0) {
114 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
115 }
116
117 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
118
119 # Execute the conversion command and get the type of the result,
120 # making sure the converter gives us the appropriate output type
121 my $output_type = lc($convert_to);
122 my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
123 $output_type = `$cmd`;
124
125 # remove symbolic link to original file
126 &util::rm($tmp_filename);
127
128 # Check STDERR here
129 chomp $output_type;
130 if ($output_type eq "fail") {
131 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
132 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
133 $self->{'num_not_processed'} ++;
134 if (-s "$errlog") {
135 open(ERRLOG, "$errlog");
136 while (<ERRLOG>) {
137 print $outhandle "$_";
138 }
139 print $outhandle "\n";
140 close ERRLOG;
141 }
142 &util::rm("$errlog") if (-e "$errlog");
143 return "";
144 }
145
146 # store the *actual* output type and return the output filename
147 # it's possible we requested conversion to html, but only to text succeeded
148
149 $self->{'convert_to_ext'} = $output_type;
150 $self->{'converted_to'} = "Rog";
151
152 my $output_filename = $tmp_filename;
153
154 $output_filename =~ s/$suffix$//;
155
156 return $output_filename;
157}
158
159
160# Remove collection specific tmp directory and all its contents.
161
162sub cleanup_tmp_area {
163 my $self = shift (@_);
164
165 my $tmp_dirname
166 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
167 &util::rm_r($tmp_dirname);
168 &util::mk_dir($tmp_dirname);
169}
170
171
172# Exact copy of read_rog_record from RogPlugin
173# Needed for FILE in right scope
174
175sub read_rog_record
176{
177 my ($self,$file_buffer, $docnum, $seclevel) = @_;
178
179 my $next_line = $file_buffer->{'next_line'};
180
181 return 0 if (!defined $next_line);
182
183 if ($next_line eq "")
184 {
185 my $line;
186 while(defined($line=<FILE>))
187 {
188 $line =~ s/\r$//;
189 $file_buffer->{'line_no'}++;
190 next if ($line =~ m/^\#/);
191 $next_line = $line;
192 last;
193 }
194 }
195
196 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
197 {
198 print STDERR "Error: Malformed Rog file: $next_line";
199 return 0;
200 }
201 else
202 {
203 # init default values
204 $file_buffer->{'song'}->{'tempo'} = 120;
205 $file_buffer->{'song'}->{'ks_type'} = 0;
206 $file_buffer->{'song'}->{'ks_num'} = 0;
207 $file_buffer->{'song'}->{'metadata'} = [];
208 $file_buffer->{'song'}->{'content'} = "";
209
210 $file_buffer->{'song'}->{'subcol'} = $2;
211 $file_buffer->{'song'}->{'title'} = $4;
212 $file_buffer->{'song'}->{'tval'} = $6;
213
214 chomp($next_line);
215 my $content = $next_line;
216 if (defined $docnum)
217 {
218 $content.= " $docnum $seclevel";
219 }
220 $content .= "\n";
221
222 $file_buffer->{'song'}->{'content'} = $content;
223
224
225 my $line;
226 while(defined($line=<FILE>))
227 {
228 $line =~ s/\r$//;
229
230 $file_buffer->{'line_no'}++;
231 next if ($line =~ m/^\#/);
232
233 if ($line =~ m/^song/)
234 {
235 $file_buffer->{'next_line'} = $line;
236 return 1;
237 }
238 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
239 {
240 $file_buffer->{'song'}->{'tempo'} = $2;
241 $file_buffer->{'song'}->{'content'} .= $line;
242 }
243 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
244 {
245 $file_buffer->{'song'}->{'ks_type'} = $2;
246 $file_buffer->{'song'}->{'ks_num'} = $4;
247 $file_buffer->{'song'}->{'content'} .= $line;
248 }
249 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
250 {
251 $file_buffer->{'song'}->{'ts_numer'} = $2;
252 $file_buffer->{'song'}->{'ts_denom'} = $4;
253 $file_buffer->{'song'}->{'content'} .= $line;
254 }
255 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
256 {
257 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
258 $file_buffer->{'song'}->{'content'} .= $line;
259 }
260 else
261 {
262 $file_buffer->{'song'}->{'content'} .= $line;
263 }
264 }
265
266 $file_buffer->{'next_line'} = undef;
267 }
268
269 return 1;
270}
271
272# Override RogPlugin function so rog files are stored as sections (not docs)
273
274sub process_rog_record
275{
276 my ($self,$doc_obj,$cursection,$song) = @_;
277
278 $cursection =
279 $doc_obj->insert_section($cursection);
280 $self->{'docnum'}++;
281
282 my $title = $song->{'title'};
283 my $title_safe = $title;
284 $title_safe =~ s/\'/\\\\&apos;/g;
285
286 # add metadata
287 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
288 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
289 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
290 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
291 $doc_obj->add_metadata($cursection, "Title", $title);
292 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
293 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
294
295 foreach my $md ( @{$song->{'metadata'}} )
296 {
297 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
298 }
299
300 # add contents as text
301 $doc_obj->add_text($cursection,$song->{'content'});
302
303 return $cursection;
304}
305
306
307
308# Override BasePlugin read
309# We don't want to get language encoding stuff until after we've converted
310# our file to Rog format
311sub read {
312 my $self = shift (@_);
313 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
314
315 my $outhandle = $self->{'outhandle'};
316
317 # check process_exp, block_exp, associate_ext etc
318 my ($block_status,$filename) = $self->read_block(@_);
319 return $block_status if ((!defined $block_status) || ($block_status==0));
320
321 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
322
323 # read in file ($text will be in utf8)
324 my $text = "";
325
326 my $output_ext = $self->{'convert_to_ext'};
327 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
328
329 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
330 $self->{'conv_filename'} = $conv_filename;
331
332
333 # create a new document
334 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
335 # the original filename is used now
336 my $doc_obj = new doc ($filename, "indexed_doc");
337 # the converted filename is set separately
338 $doc_obj->set_converted_filename($conv_filename);
339
340 my $topsection = $doc_obj->get_top_section();
341 my $cursection = $topsection;
342
343 $self->{'docnum'}++;
344 my $docnum = $self->{'docnum'};
345
346 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
347 my ($filemeta) = $file =~ /([^\\\/]+)$/;
348 $self->set_Source_metadata($doc_obj, $filemeta);
349
350 if ($self->{'cover_image'}) {
351 $self->associate_cover_image($doc_obj, $filename);
352 }
353 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
354 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
355
356 my $track_no = "1";
357 my $rog_filename = "$conv_filename$track_no.$output_ext";
358 while (1)
359 {
360 last unless open (FILE, $rog_filename) ;
361
362 my $file_buffer = { line_no => 0, next_line => "", song => {} };
363
364 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
365 {
366 my $song = $file_buffer->{'song'};
367 my $content = $song->{'content'};
368 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
369
370 $cursection
371 = $self->process_rog_record($doc_obj,$cursection,
372 $file_buffer->{'song'});
373 }
374
375 close FILE;
376
377 $track_no++;
378 $rog_filename = "$conv_filename$track_no.$output_ext";
379 }
380
381 print STDERR "\n";
382
383 # include any metadata passed in from previous plugins
384 # note that this metadata is associated with the top level section
385 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
386 # do plugin specific processing of doc_obj
387 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
388 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
389 return -1;
390 }
391 # do any automatic metadata extraction
392 $self->auto_extract_metadata ($doc_obj);
393 # add an OID
394 $doc_obj->set_OID();
395
396 my $oid = $doc_obj->get_OID();
397 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
398
399 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
400 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
401 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
402
403 # if no title metadata defined, set it to filename minus extension
404 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
405 if (!defined $existing_title)
406 {
407 my $title = $doc_obj->get_metadata_element($topsection,"Source");
408 $title =~ s/\..*?$//g;
409 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
410
411 my $title_safe = $title;
412 $title_safe =~ s/\'/\\\\&apos;/g;
413 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
414 }
415
416 # process the document
417 $processor->process($doc_obj);
418 $self->cleanup_tmp_area();
419
420 $self->{'num_processed'} ++;
421
422 return 1;
423}
424
425
426# do plugin specific processing of doc_obj for HTML type
427sub process_type {
428 my $self = shift (@_);
429 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
430
431 my $conv_filename = $self->{'conv_filename'};
432 my $tmp_dirname = File::Basename::dirname($conv_filename);
433 my $tmp_tailname = File::Basename::basename($conv_filename);
434
435 my $converted_to = $self->{'converted_to'};
436 my $ret_val = 1;
437
438# $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
439# $tmp_dirname, $tmp_tailname,
440# $metadata, $doc_obj);
441
442 # associate original file with doc object
443 my $cursection = $doc_obj->get_top_section();
444 my $filename = &util::filename_cat($base_dir, $file);
445 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
446
447 my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";
448 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
449 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
450 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
451 return $ret_val;
452}
453
4541;
Note: See TracBrowser for help on using the repository browser.