source: gsdl/trunk/perllib/plugins/ConvertToRogPlugin.pm@ 16392

Last change on this file since 16392 was 16392, checked in by kjdon, 16 years ago

global block pass: read_block is no more, use can_process_this_file to see whether a file is for us or not. extra arg (block_hash) to read, read_into_doc_obj, metadata_read etc

  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
# Establish the inheritance chain at compile time: this plugin is a RogPlugin.
BEGIN {
    @ConvertToRogPlugin::ISA = qw(RogPlugin);
}

# This plugin declares no arguments of its own.
my $arguments = [];

# Option record that new() appends to the shared "OptList".
my $options = {
    'name'     => "ConvertToRogPlugin",
    'desc'     => "{ConvertToRogPlugin.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
};
44
# Constructor.
#
# Appends this class to $pluginlist, adds this plugin's argument list and
# option record to $hashArgOptLists, builds the object through RogPlugin's
# constructor, records the conversion target ("Rog" / "rog") and blesses
# the result into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = RogPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'convert_to'}     = "Rog";
    $self->{'convert_to_ext'} = "rog";

    return bless($self, $class);
}
60
61
# Start hook: forwards the arguments to the parent class's begin and then
# resets this plugin's document counter to zero.
sub begin {
    my $self = shift;

    $self->SUPER::begin(@_);

    return $self->{'docnum'} = 0;
}
69
# End hook, kept symmetric with begin: there is nothing to do locally, so
# the call is simply forwarded to the parent class and its result returned.
#
# The invocant is shifted off @_ before forwarding so that the parent's end
# receives only the caller's arguments (not a second copy of $self).
sub end {
    my $self = shift (@_);

    return $self->SUPER::end(@_);
}
76
77
# Run the conversion utility (gsMusicConvert.pl) on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory
# (GSDLCOLLECTDIR/tmp) so that we don't accidentally damage the input: the
# input is soft-linked into that directory and the link is what gets
# converted.
#
# Parameters:
#   $output_ext     - accepted for interface compatibility; not consulted
#                     here (the requested type is lc($self->{'convert_to'}))
#   $input_filename - file to convert
#   $textref        - accepted for interface compatibility; unused
#
# Returns the temporary filename with its suffix removed, or "" if the
# conversion failed. On success $self->{'convert_to_ext'} is set to the type
# printed by the converter and $self->{'converted_to'} is set to "Rog".

sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle  = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # make sure the collection tmp dir exists
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, undef, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the command line is assembled as a single shell string;
    # a path containing a double quote or other shell metacharacters would
    # break the quoting. Left as is because list-form pipes are not
    # available on every platform/perl this code may run on.
    my $requested_type = lc($convert_to);
    my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $requested_type \"$tmp_filename\"";
    my $output_type = `$cmd`;
    $output_type = "" unless (defined $output_type);

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    chomp $output_type;

    # The converter prints the resulting type on STDOUT, or "fail". No output
    # at all means the converter could not be run, which is also a failure:
    # an empty type can never name a usable output file.
    if ($output_type eq "fail" || $output_type eq "") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;

        # relay whatever the converter wrote to its error log
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (defined(my $log_line = <$errlog_fh>)) {
                    print $outhandle $log_line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm("$errlog") if (-e "$errlog");
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    $self->{'converted_to'} = "Rog";

    # strip the suffix; \Q..\E so that it is matched literally
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$//;

    return $output_filename;
}
158
159
# Empty the collection specific tmp directory (GSDLCOLLECTDIR/tmp): delete
# it together with all its contents, then create it again.

sub cleanup_tmp_area {
    my ($self) = @_;

    my $collection_tmp = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");

    &util::rm_r($collection_tmp);
    &util::mk_dir($collection_tmp);
}
170
171
# Read the next song record from the package-level FILE handle.
#
# Adapted from read_rog_record in RogPlugin; it is repeated here because it
# reads the bareword FILE handle, which has to be the one in this package's
# scope.
#
# Parameters:
#   $file_buffer - hash ref carrying the reader state between calls:
#                    'line_no'   - incremented for every line read
#                    'next_line' - "" before the first record, the pending
#                                  "song" header line between records, undef
#                                  once the input is exhausted
#                    'song'      - hash ref filled in with the record
#   $docnum, $seclevel - when $docnum is defined, " $docnum $seclevel" is
#                        appended to the song header line stored in 'content'
#
# Returns 1 when a record was read into $file_buffer->{'song'}, 0 at end of
# input or when the header line is malformed (an error is printed to STDERR).

sub read_rog_record
{
    my ($self, $file_buffer, $docnum, $seclevel) = @_;

    my $next_line = $file_buffer->{'next_line'};

    return 0 if (!defined $next_line);

    if ($next_line eq "")
    {
        # nothing buffered yet: take the first non-comment line
        while (defined(my $line = <FILE>))
        {
            $line =~ s/\r$//;
            $file_buffer->{'line_no'}++;
            next if ($line =~ m/^\#/);
            $next_line = $line;
            last;
        }
    }

    # header: song "<subcollection>" "<title>" <tval>
    my ($subcol, $title, $tval)
        = $next_line =~ m/^song +\"([^\"]*)\" +\"([^\"]*)\" +(\d+) *$/;

    if (!defined $tval)
    {
        print STDERR "Error: Malformed Rog file: $next_line";
        return 0;
    }

    chomp($next_line);
    my $content = $next_line;
    if (defined $docnum)
    {
        $content .= " $docnum $seclevel";
    }
    $content .= "\n";

    # Re-initialise the song record in place. Assigning the whole hash
    # discards every field of the previous song, so optional fields such as
    # 'ts_numer'/'ts_denom' do not leak into a song that has no timesig line.
    my $song = ($file_buffer->{'song'} ||= {});
    %{$song} = (
        'tempo'    => 120,
        'ks_type'  => 0,
        'ks_num'   => 0,
        'metadata' => [],
        'subcol'   => $subcol,
        'title'    => $title,
        'tval'     => $tval,
        'content'  => $content,
    );

    while (defined(my $line = <FILE>))
    {
        $line =~ s/\r$//;

        $file_buffer->{'line_no'}++;
        next if ($line =~ m/^\#/);

        if ($line =~ m/^song/)
        {
            # start of the following record: keep it for the next call
            $file_buffer->{'next_line'} = $line;
            return 1;
        }

        if ($line =~ m/^tempo +(\d+) *$/)
        {
            $song->{'tempo'} = $1;
        }
        elsif ($line =~ m/^keysig +(\d+) +(\d+) *$/)
        {
            $song->{'ks_type'} = $1;
            $song->{'ks_num'}  = $2;
        }
        elsif ($line =~ m/^timesig +(\d+) +(\d+) *$/)
        {
            $song->{'ts_numer'} = $1;
            $song->{'ts_denom'} = $2;
        }
        elsif ($line =~ m/^metadata ([^:]*): (.*)/)
        {
            push(@{$song->{'metadata'}}, [$1, $2]);
        }

        # every non-comment line, recognised or not, is part of the content
        $song->{'content'} .= $line;
    }

    # ran off the end of the input: no further records
    $file_buffer->{'next_line'} = undef;

    return 1;
}
271
# Override RogPlugin function so rog files are stored as sections (not docs)
#
# Inserts a new section at $cursection in $doc_obj, bumps the plugin's
# 'docnum' counter, attaches the song's fixed fields and its free-form
# metadata pairs to the new section, stores the song content as the section
# text and returns the new section.

sub process_rog_record
{
    my ($self, $doc_obj, $cursection, $song) = @_;

    my $section = $doc_obj->insert_section($cursection);
    $self->{'docnum'}++;

    # TitleSafe is the title with every apostrophe replaced
    my $title = $song->{'title'};
    (my $title_safe = $title) =~ s/\'/\\\\&apos;/g;

    # fixed metadata, in the order it is to be added
    my @fixed_fields = (
        [ "Tempo",         $song->{'tempo'}   ],
        [ "KeySigType",    $song->{'ks_type'} ],
        [ "KeySigNum",     $song->{'ks_num'}  ],
        [ "SubCollection", $song->{'subcol'}  ],
        [ "Title",         $title             ],
        [ "TitleSafe",     $title_safe        ],
        [ "TVal",          $song->{'tval'}    ],
    );
    foreach my $field (@fixed_fields)
    {
        $doc_obj->add_metadata($section, $field->[0], $field->[1]);
    }

    # metadata lines collected from the record itself
    foreach my $pair ( @{$song->{'metadata'}} )
    {
        $doc_obj->add_metadata($section, $pair->[0], $pair->[1]);
    }

    # add contents as text
    $doc_obj->add_text($section, $song->{'content'});

    return $section;
}
305
306
307
# Override BasePlugin read
# We don't want to get language encoding stuff until after we've converted
# our file to Rog format
#
# The file is converted in the collection tmp area, and every converted
# track file ("<converted name><track number>.<ext>", numbered from 1) is
# read song by song into sections of a single document.
#
# Returns undef if the file is not one this plugin can process, 0 if the
# conversion failed (so the import can continue), -1 if process() failed,
# and 1 once the document has been handed to $processor.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # no text is read here; the songs are added to the document directly
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);

    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    $self->{'conv_filename'} = $conv_filename;

    # create a new document named after the original file; the converted
    # filename is set separately
    my $doc_obj = doc->new($filename_full_path, "indexed_doc");
    $doc_obj->set_converted_filename($conv_filename);

    my $topsection = $doc_obj->get_top_section();
    my $cursection = $topsection;

    $self->{'docnum'}++;
    my $docnum = $self->{'docnum'};

    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $self->set_Source_metadata($doc_obj, $filemeta);

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename_full_path);
    }
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));

    # Read the track files in turn until one cannot be opened.
    # read_rog_record reads from the package-level FILE handle, so the
    # bareword handle is required here.
    my $track_no = "1";
    my $rog_filename = "$conv_filename$track_no.$output_ext";
    while (open (FILE, '<', $rog_filename))
    {
        my $file_buffer = { line_no => 0, next_line => "", song => {} };

        while ($self->read_rog_record($file_buffer, $docnum, $track_no))
        {
            $cursection
                = $self->process_rog_record($doc_obj, $cursection,
                                            $file_buffer->{'song'});
        }

        close FILE;

        $track_no++;
        $rog_filename = "$conv_filename$track_no.$output_ext";
    }

    print STDERR "\n";

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    # do plugin specific processing of doc_obj
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        # don't leave the converted files behind on failure
        $self->cleanup_tmp_area();
        return -1;
    }
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
    # add an OID
    $doc_obj->set_OID();

    my $oid = $doc_obj->get_OID();
    my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";

    $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
    $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
    $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");

    # if no title metadata defined, set it to filename minus extension
    my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
    if (!defined $existing_title)
    {
        my $title = $doc_obj->get_metadata_element($topsection,"Source");
        # remove only the last ".ext", keeping any earlier dots in the name
        $title =~ s/\.[^.]*$//;
        $doc_obj->add_utf8_metadata ($topsection, "Title", $title);

        my $title_safe = $title;
        $title_safe =~ s/\'/\\\\&apos;/g;
        $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
    }

    # process the document
    $processor->process($doc_obj);
    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}
423
424
# Plugin specific processing of doc_obj: associate the original source file
# with the document and add the metadata used to link back to it.
#
# Parameters:
#   $doc_ext           - extension used for the associated file name
#                        ("doc.$doc_ext") and for the source icon macro
#   $base_dir, $file   - joined to locate the original file
#   $doc_obj           - document to update
#   $textref, $pluginfo, $metadata - accepted but not used here
#
# Always returns 1. RogPlugin's own process is deliberately not called.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);

    # srclink ... /srclink wrap the source icon in a link to the original
    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";
    $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
    $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");

    return 1;
}
452
4531;
Note: See TracBrowser for help on using the repository browser.