source: trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm@ 11090

Last change on this file since 11090 was 11090, checked in by kjdon, 18 years ago

made all plugins that implement read() call read_block to check process_exp, block_exp, smart blocking, cover image blocking etc

  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlug.pm -- plugin that inherits from RogPlug
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlug;
29
30use BasPlug;
31use RogPlug;
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35sub BEGIN {
36 @ConvertToRogPlug::ISA = ('RogPlug');
37}
38
39my $arguments = [
40 ];
41my $options = { 'name' => "ConvertToRogPlug",
42 'desc' => "{ConvertToRogPlug.desc}",
43 'abstract' => "yes",
44 'inherits' => "yes" };
45
46sub new {
47 my ($class) = shift (@_);
48 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
49 push(@$pluginlist, $class);
50
51 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
52 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
53
54 my $self = (defined $hashArgOptLists)? new RogPlug($pluginlist,$inputargs,$hashArgOptLists): new RogPlug($pluginlist,$inputargs);
55
56 $self->{'convert_to'} = "Rog";
57 $self->{'convert_to_ext'} = "rog";
58
59 return bless $self, $class;
60}
61
62
63sub begin {
64 my $self = shift (@_);
65
66 $self->SUPER::begin(@_);
67
68 $self->{'docnum'} = 0;
69}
70
71sub end {
72 my ($self) = @_;
73
74 # nothing to do, but keep symmetric with begin function
75 $self->SUPER::end(@_);
76}
77
78
79# Run conversion utility on the input file.
80#
81# The conversion takes place in a collection specific 'tmp' directory so
82# that we don't accidentally damage the input.
83#
84# The desired output type is indicated by $output_ext. This is usually
85# something like "html" or "word", but can be "best" (or the empty string)
86# to indicate that the conversion utility should do the best it can.
87
88sub tmp_area_convert_file {
89 my $self = shift (@_);
90 my ($output_ext, $input_filename, $textref) = @_;
91
92 my $outhandle = $self->{'outhandle'};
93 my $convert_to = $self->{'convert_to'};
94 my $failhandle = $self->{'failhandle'};
95
96 # softlink to collection tmp dir
97 my $tmp_dirname
98 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
99 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
100
101 # derive tmp filename from input filename
102 my ($tailname, $dirname, $suffix)
103 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
104
105 # Remove any white space from filename -- no risk of name collision, and
106 # makes later conversion by utils simpler. Leave spaces in path...
107 $tailname =~ s/\s+//g;
108
109 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
110
111 &util::soft_link($input_filename, $tmp_filename);
112
113 my $verbosity = $self->{'verbosity'};
114 if ($verbosity > 0) {
115 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
116 }
117
118 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
119
120 # Execute the conversion command and get the type of the result,
121 # making sure the converter gives us the appropriate output type
122 my $output_type = lc($convert_to);
123 my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
124 $output_type = `$cmd`;
125
126 # remove symbolic link to original file
127 &util::rm($tmp_filename);
128
129 # Check STDERR here
130 chomp $output_type;
131 if ($output_type eq "fail") {
132 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
133 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
134 $self->{'num_not_processed'} ++;
135 if (-s "$errlog") {
136 open(ERRLOG, "$errlog");
137 while (<ERRLOG>) {
138 print $outhandle "$_";
139 }
140 print $outhandle "\n";
141 close ERRLOG;
142 }
143 &util::rm("$errlog") if (-e "$errlog");
144 return "";
145 }
146
147 # store the *actual* output type and return the output filename
148 # it's possible we requested conversion to html, but only to text succeeded
149
150 $self->{'convert_to_ext'} = $output_type;
151 $self->{'converted_to'} = "Rog";
152
153 my $output_filename = $tmp_filename;
154
155 $output_filename =~ s/$suffix$//;
156
157 return $output_filename;
158}
159
160
161# Remove collection specific tmp directory and all its contents.
162
163sub cleanup_tmp_area {
164 my $self = shift (@_);
165
166 my $tmp_dirname
167 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
168 &util::rm_r($tmp_dirname);
169 &util::mk_dir($tmp_dirname);
170}
171
172
173# Exact copy of read_rog_record from RogPlug
174# Needed for FILE in right scope
175
176sub read_rog_record
177{
178 my ($self,$file_buffer, $docnum, $seclevel) = @_;
179
180 my $next_line = $file_buffer->{'next_line'};
181
182 return 0 if (!defined $next_line);
183
184 if ($next_line eq "")
185 {
186 my $line;
187 while(defined($line=<FILE>))
188 {
189 $line =~ s/\r$//;
190 $file_buffer->{'line_no'}++;
191 next if ($line =~ m/^\#/);
192 $next_line = $line;
193 last;
194 }
195 }
196
197 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
198 {
199 print STDERR "Error: Malformed Rog file: $next_line";
200 return 0;
201 }
202 else
203 {
204 # init default values
205 $file_buffer->{'song'}->{'tempo'} = 120;
206 $file_buffer->{'song'}->{'ks_type'} = 0;
207 $file_buffer->{'song'}->{'ks_num'} = 0;
208 $file_buffer->{'song'}->{'metadata'} = [];
209 $file_buffer->{'song'}->{'content'} = "";
210
211 $file_buffer->{'song'}->{'subcol'} = $2;
212 $file_buffer->{'song'}->{'title'} = $4;
213 $file_buffer->{'song'}->{'tval'} = $6;
214
215 chomp($next_line);
216 my $content = $next_line;
217 if (defined $docnum)
218 {
219 $content.= " $docnum $seclevel";
220 }
221 $content .= "\n";
222
223 $file_buffer->{'song'}->{'content'} = $content;
224
225
226 my $line;
227 while(defined($line=<FILE>))
228 {
229 $line =~ s/\r$//;
230
231 $file_buffer->{'line_no'}++;
232 next if ($line =~ m/^\#/);
233
234 if ($line =~ m/^song/)
235 {
236 $file_buffer->{'next_line'} = $line;
237 return 1;
238 }
239 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
240 {
241 $file_buffer->{'song'}->{'tempo'} = $2;
242 $file_buffer->{'song'}->{'content'} .= $line;
243 }
244 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
245 {
246 $file_buffer->{'song'}->{'ks_type'} = $2;
247 $file_buffer->{'song'}->{'ks_num'} = $4;
248 $file_buffer->{'song'}->{'content'} .= $line;
249 }
250 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
251 {
252 $file_buffer->{'song'}->{'ts_numer'} = $2;
253 $file_buffer->{'song'}->{'ts_denom'} = $4;
254 $file_buffer->{'song'}->{'content'} .= $line;
255 }
256 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
257 {
258 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
259 $file_buffer->{'song'}->{'content'} .= $line;
260 }
261 else
262 {
263 $file_buffer->{'song'}->{'content'} .= $line;
264 }
265 }
266
267 $file_buffer->{'next_line'} = undef;
268 }
269
270 return 1;
271}
272
273# Override RogPlug function so rog files are stored as sections (not docs)
274
275sub process_rog_record
276{
277 my ($self,$doc_obj,$cursection,$song) = @_;
278
279 $cursection =
280 $doc_obj->insert_section($cursection);
281 $self->{'docnum'}++;
282
283 my $title = $song->{'title'};
284 my $title_safe = $title;
285 $title_safe =~ s/\'/\\\\&apos;/g;
286
287 # add metadata
288 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
289 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
290 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
291 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
292 $doc_obj->add_metadata($cursection, "Title", $title);
293 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
294 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
295
296 foreach my $md ( @{$song->{'metadata'}} )
297 {
298 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
299 }
300
301 # add contents as text
302 $doc_obj->add_text($cursection,$song->{'content'});
303
304 return $cursection;
305}
306
307
308
309# Override BasPlug read
310# We don't want to get language encoding stuff until after we've converted
311# our file to Rog format
312sub read {
313 my $self = shift (@_);
314 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
315
316 my $outhandle = $self->{'outhandle'};
317
318 # check process_exp, block_exp, associate_ext etc
319 my ($block_status,$filename) = $self->read_block(@_);
320 return $block_status if ((!defined $block_status) || ($block_status==0));
321
322 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
323
324 # read in file ($text will be in utf8)
325 my $text = "";
326
327 my $output_ext = $self->{'convert_to_ext'};
328 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
329
330 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
331 $self->{'conv_filename'} = $conv_filename;
332
333
334 # create a new document
335 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
336 # the original filename is used now
337 my $doc_obj = new doc ($filename, "indexed_doc");
338 # the converted filename is set separately
339 $doc_obj->set_converted_filename($conv_filename);
340
341 my $topsection = $doc_obj->get_top_section();
342 my $cursection = $topsection;
343
344 $self->{'docnum'}++;
345 my $docnum = $self->{'docnum'};
346
347 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
348 my ($filemeta) = $file =~ /([^\\\/]+)$/;
349 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
350 if ($self->{'cover_image'}) {
351 $self->associate_cover_image($doc_obj, $filename);
352 }
353 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
354 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
355
356 my $track_no = "1";
357 my $rog_filename = "$conv_filename$track_no.$output_ext";
358 while (1)
359 {
360 last unless open (FILE, $rog_filename) ;
361
362 my $file_buffer = { line_no => 0, next_line => "", song => {} };
363
364 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
365 {
366 my $song = $file_buffer->{'song'};
367 my $content = $song->{'content'};
368 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
369
370 $cursection
371 = $self->process_rog_record($doc_obj,$cursection,
372 $file_buffer->{'song'});
373 }
374
375 close FILE;
376
377 $track_no++;
378 $rog_filename = "$conv_filename$track_no.$output_ext";
379 }
380
381 print STDERR "\n";
382
383 # include any metadata passed in from previous plugins
384 # note that this metadata is associated with the top level section
385 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
386 # do plugin specific processing of doc_obj
387 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
388 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
389 return -1;
390 }
391 # do any automatic metadata extraction
392 $self->auto_extract_metadata ($doc_obj);
393 # add an OID
394 $doc_obj->set_OID();
395
396 my $oid = $doc_obj->get_OID();
397 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
398
399 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
400 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
401 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
402
403 # if no title metadata defined, set it to filename minus extension
404 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
405 if (!defined $existing_title)
406 {
407 my $title = $doc_obj->get_metadata_element($topsection,"Source");
408 $title =~ s/\..*?$//g;
409 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
410
411 my $title_safe = $title;
412 $title_safe =~ s/\'/\\\\&apos;/g;
413 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
414 }
415
416 # process the document
417 $processor->process($doc_obj);
418 $self->cleanup_tmp_area();
419
420 $self->{'num_processed'} ++;
421
422 return 1;
423}
424
425
426# do plugin specific processing of doc_obj for HTML type
427sub process_type {
428 my $self = shift (@_);
429 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
430
431 my $conv_filename = $self->{'conv_filename'};
432 my $tmp_dirname = File::Basename::dirname($conv_filename);
433 my $tmp_tailname = File::Basename::basename($conv_filename);
434
435 my $converted_to = $self->{'converted_to'};
436 my $ret_val = 1;
437
438# $ret_val = &RogPlug::process($self, $textref, $pluginfo,
439# $tmp_dirname, $tmp_tailname,
440# $metadata, $doc_obj);
441
442 # associate original file with doc object
443 my $cursection = $doc_obj->get_top_section();
444 my $filename = &util::filename_cat($base_dir, $file);
445 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
446
447 my $doclink = "<a href=\"_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext\">";
448 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
449 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
450 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
451 return $ret_val;
452}
453
4541;
Note: See TracBrowser for help on using the repository browser.