source: main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm@ 28563

Last change on this file since 28563 was 28563, checked in by kjdon, 10 years ago

changing some util:: methods to FileUtils:: methods

  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33use util;
34use FileUtils;
35
36sub BEGIN {
37 @ConvertToRogPlugin::ISA = ('RogPlugin');
38}
39
40my $arguments = [
41 ];
42my $options = { 'name' => "ConvertToRogPlugin",
43 'desc' => "{ConvertToRogPlugin.desc}",
44 'abstract' => "yes",
45 'inherits' => "yes" };
46
47sub new {
48 my ($class) = shift (@_);
49 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
50 push(@$pluginlist, $class);
51
52 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
53 push(@{$hashArgOptLists->{"OptList"}},$options);
54
55 my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
56
57 $self->{'convert_to'} = "Rog";
58 $self->{'convert_to_ext'} = "rog";
59
60 return bless $self, $class;
61}
62
63
64sub begin {
65 my $self = shift (@_);
66
67 $self->SUPER::begin(@_);
68
69 $self->{'docnum'} = 0;
70}
71
72sub end {
73 my ($self) = @_;
74
75 # nothing to do, but keep symmetric with begin function
76 $self->SUPER::end(@_);
77}
78
79
80# Run conversion utility on the input file.
81#
82# The conversion takes place in a collection specific 'tmp' directory so
83# that we don't accidentally damage the input.
84#
85# The desired output type is indicated by $output_ext. This is usually
86# something like "html" or "word", but can be "best" (or the empty string)
87# to indicate that the conversion utility should do the best it can.
88
89sub tmp_area_convert_file {
90 my $self = shift (@_);
91 my ($output_ext, $input_filename, $textref) = @_;
92
93 my $outhandle = $self->{'outhandle'};
94 my $convert_to = $self->{'convert_to'};
95 my $failhandle = $self->{'failhandle'};
96
97 # softlink to collection tmp dir
98 my $tmp_dirname
99 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
100 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
101
102 # derive tmp filename from input filename
103 my ($tailname, $dirname, $suffix)
104 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
105
106 # Remove any white space from filename -- no risk of name collision, and
107 # makes later conversion by utils simpler. Leave spaces in path...
108 $tailname =~ s/\s+//g;
109 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
110
111 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
112
113 &FileUtils::softLink($input_filename, $tmp_filename);
114
115 my $verbosity = $self->{'verbosity'};
116 if ($verbosity > 0) {
117 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
118 }
119
120 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");
121
122 # Execute the conversion command and get the type of the result,
123 # making sure the converter gives us the appropriate output type
124 my $output_type = lc($convert_to);
125 my $cmd = "\"".&util::get_perl_exec()."\" -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
126 $output_type = `$cmd`;
127
128 # remove symbolic link to original file
129 &FileUtils::removeFiles($tmp_filename);
130
131 # Check STDERR here
132 chomp $output_type;
133 if ($output_type eq "fail") {
134 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
135 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
136 $self->{'num_not_processed'} ++;
137 if (-s "$errlog") {
138 open(ERRLOG, "$errlog");
139 while (<ERRLOG>) {
140 print $outhandle "$_";
141 }
142 print $outhandle "\n";
143 close ERRLOG;
144 }
145 &FileUtils::removeFiles("$errlog") if (-e "$errlog");
146 return "";
147 }
148
149 # store the *actual* output type and return the output filename
150 # it's possible we requested conversion to html, but only to text succeeded
151
152 $self->{'convert_to_ext'} = $output_type;
153 $self->{'converted_to'} = "Rog";
154
155 my $output_filename = $tmp_filename;
156
157 $output_filename =~ s/$suffix$//;
158
159 return $output_filename;
160}
161
162
163# Remove collection specific tmp directory and all its contents.
164
165sub cleanup_tmp_area {
166 my $self = shift (@_);
167
168 my $tmp_dirname
169 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
170 &FileUtils::removeFilesRecursive($tmp_dirname);
171 &FileUtils::makeDirectory($tmp_dirname);
172}
173
174
175# Exact copy of read_rog_record from RogPlugin
176# Needed for FILE in right scope
177
178sub read_rog_record
179{
180 my ($self,$file_buffer, $docnum, $seclevel) = @_;
181
182 my $next_line = $file_buffer->{'next_line'};
183
184 return 0 if (!defined $next_line);
185
186 if ($next_line eq "")
187 {
188 my $line;
189 while(defined($line=<FILE>))
190 {
191 $line =~ s/\r$//;
192 $file_buffer->{'line_no'}++;
193 next if ($line =~ m/^\#/);
194 $next_line = $line;
195 last;
196 }
197 }
198
199 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
200 {
201 print STDERR "Error: Malformed Rog file: $next_line";
202 return 0;
203 }
204 else
205 {
206 # init default values
207 $file_buffer->{'song'}->{'tempo'} = 120;
208 $file_buffer->{'song'}->{'ks_type'} = 0;
209 $file_buffer->{'song'}->{'ks_num'} = 0;
210 $file_buffer->{'song'}->{'metadata'} = [];
211 $file_buffer->{'song'}->{'content'} = "";
212
213 $file_buffer->{'song'}->{'subcol'} = $2;
214 $file_buffer->{'song'}->{'title'} = $4;
215 $file_buffer->{'song'}->{'tval'} = $6;
216
217 chomp($next_line);
218 my $content = $next_line;
219 if (defined $docnum)
220 {
221 $content.= " $docnum $seclevel";
222 }
223 $content .= "\n";
224
225 $file_buffer->{'song'}->{'content'} = $content;
226
227
228 my $line;
229 while(defined($line=<FILE>))
230 {
231 $line =~ s/\r$//;
232
233 $file_buffer->{'line_no'}++;
234 next if ($line =~ m/^\#/);
235
236 if ($line =~ m/^song/)
237 {
238 $file_buffer->{'next_line'} = $line;
239 return 1;
240 }
241 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
242 {
243 $file_buffer->{'song'}->{'tempo'} = $2;
244 $file_buffer->{'song'}->{'content'} .= $line;
245 }
246 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
247 {
248 $file_buffer->{'song'}->{'ks_type'} = $2;
249 $file_buffer->{'song'}->{'ks_num'} = $4;
250 $file_buffer->{'song'}->{'content'} .= $line;
251 }
252 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
253 {
254 $file_buffer->{'song'}->{'ts_numer'} = $2;
255 $file_buffer->{'song'}->{'ts_denom'} = $4;
256 $file_buffer->{'song'}->{'content'} .= $line;
257 }
258 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
259 {
260 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
261 $file_buffer->{'song'}->{'content'} .= $line;
262 }
263 else
264 {
265 $file_buffer->{'song'}->{'content'} .= $line;
266 }
267 }
268
269 $file_buffer->{'next_line'} = undef;
270 }
271
272 return 1;
273}
274
275# Override RogPlugin function so rog files are stored as sections (not docs)
276
277sub process_rog_record
278{
279 my ($self,$doc_obj,$cursection,$song) = @_;
280
281 $cursection =
282 $doc_obj->insert_section($cursection);
283 $self->{'docnum'}++;
284
285 my $title = $song->{'title'};
286 my $title_safe = $title;
287 $title_safe =~ s/\'/\\\\&apos;/g;
288
289 # add metadata
290 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
291 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
292 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
293 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
294 $doc_obj->add_metadata($cursection, "Title", $title);
295 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
296 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
297
298 foreach my $md ( @{$song->{'metadata'}} )
299 {
300 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
301 }
302
303 # add contents as text
304 $doc_obj->add_text($cursection,$song->{'content'});
305
306 return $cursection;
307}
308
309
310
311# Override BasePlugin read
312# We don't want to get language encoding stuff until after we've converted
313# our file to Rog format
314sub read {
315 my $self = shift (@_);
316 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
317
318 my $outhandle = $self->{'outhandle'};
319
320 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321 return undef unless $self->can_process_this_file($filename_full_path);
322
323 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
324
325 # read in file ($text will be in utf8)
326 my $text = "";
327
328 my $output_ext = $self->{'convert_to_ext'};
329 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
330
331 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
332 $self->{'conv_filename'} = $conv_filename;
333
334
335 # create a new document
336 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
337 # the original filename is used now
338 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
339 # the converted filename is set separately
340 $doc_obj->set_converted_filename($conv_filename);
341
342 my $topsection = $doc_obj->get_top_section();
343 my $cursection = $topsection;
344
345 $self->{'docnum'}++;
346 my $docnum = $self->{'docnum'};
347
348 my ($filemeta) = $file =~ /([^\\\/]+)$/;
349 my $plugin_filename_encoding = $self->{'filename_encoding'};
350 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
351 $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding);
352
353 if ($self->{'cover_image'}) {
354 $self->associate_cover_image($doc_obj, $filename_full_path);
355 }
356 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
357 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
358
359 my $track_no = "1";
360 my $rog_filename = "$conv_filename$track_no.$output_ext";
361 while (1)
362 {
363 last unless open (FILE, $rog_filename) ;
364
365 my $file_buffer = { line_no => 0, next_line => "", song => {} };
366
367 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
368 {
369 my $song = $file_buffer->{'song'};
370 my $content = $song->{'content'};
371 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
372
373 $cursection
374 = $self->process_rog_record($doc_obj,$cursection,
375 $file_buffer->{'song'});
376 }
377
378 close FILE;
379
380 $track_no++;
381 $rog_filename = "$conv_filename$track_no.$output_ext";
382 }
383
384 print STDERR "\n";
385
386 # include any metadata passed in from previous plugins
387 # note that this metadata is associated with the top level section
388 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
389 # do plugin specific processing of doc_obj
390 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
391 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
392 return -1;
393 }
394 # do any automatic metadata extraction
395 $self->auto_extract_metadata ($doc_obj);
396 # add an OID
397 $self->add_OID($doc_obj);
398
399 my $oid = $doc_obj->get_OID();
400 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
401
402 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
403 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
404 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
405
406 # if no title metadata defined, set it to filename minus extension
407 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
408 if (!defined $existing_title)
409 {
410 my $title = $doc_obj->get_metadata_element($topsection,"Source");
411 $title =~ s/\..*?$//g;
412 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
413
414 my $title_safe = $title;
415 $title_safe =~ s/\'/\\\\&apos;/g;
416 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
417 }
418
419 # process the document
420 $processor->process($doc_obj);
421 $self->cleanup_tmp_area();
422
423 $self->{'num_processed'} ++;
424
425 return 1;
426}
427
428
429# do plugin specific processing of doc_obj for HTML type
430sub process_type {
431 my $self = shift (@_);
432 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
433
434 my $conv_filename = $self->{'conv_filename'};
435 my $tmp_dirname = File::Basename::dirname($conv_filename);
436 my $tmp_tailname = File::Basename::basename($conv_filename);
437
438 my $converted_to = $self->{'converted_to'};
439 my $ret_val = 1;
440
441# $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
442# $tmp_dirname, $tmp_tailname,
443# $metadata, $doc_obj);
444
445 # associate original file with doc object
446 my $cursection = $doc_obj->get_top_section();
447 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
448 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
449
450 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
451 $doc_obj->add_metadata ($cursection, "srclink_file", "doc.$doc_ext");
452 $doc_obj->add_metadata ($cursection, "srclinkFile", "doc.$doc_ext");
453 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
454
455 return $ret_val;
456}
457
4581;
Note: See TracBrowser for help on using the repository browser.