source: main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm

Last change on this file was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33use util;
34use FileUtils;
35
36sub BEGIN {
37 @ConvertToRogPlugin::ISA = ('RogPlugin');
38}
39
40my $arguments = [
41 ];
42my $options = { 'name' => "ConvertToRogPlugin",
43 'desc' => "{ConvertToRogPlugin.desc}",
44 'abstract' => "yes",
45 'inherits' => "yes" };
46
47sub new {
48 my ($class) = shift (@_);
49 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
50 push(@$pluginlist, $class);
51
52 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
53 push(@{$hashArgOptLists->{"OptList"}},$options);
54
55 my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
56
57 $self->{'convert_to'} = "Rog";
58 $self->{'convert_to_ext'} = "rog";
59
60 return bless $self, $class;
61}
62
63
64sub begin {
65 my $self = shift (@_);
66
67 $self->SUPER::begin(@_);
68
69 $self->{'docnum'} = 0;
70}
71
72sub end {
73 my ($self) = @_;
74
75 # nothing to do, but keep symmetric with begin function
76 $self->SUPER::end(@_);
77}
78
79
80# Run conversion utility on the input file.
81#
82# The conversion takes place in a collection specific 'tmp' directory so
83# that we don't accidentally damage the input.
84#
85# The desired output type is indicated by $output_ext. This is usually
86# something like "html" or "word", but can be "best" (or the empty string)
87# to indicate that the conversion utility should do the best it can.
88
89sub tmp_area_convert_file {
90 my $self = shift (@_);
91 my ($output_ext, $input_filename, $textref) = @_;
92
93 my $outhandle = $self->{'outhandle'};
94 my $convert_to = $self->{'convert_to'};
95 my $failhandle = $self->{'failhandle'};
96
97 # softlink to collection tmp dir
98 my $tmp_dirname
99 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
100 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
101
102 # derive tmp filename from input filename
103 my ($tailname, $dirname, $suffix)
104 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
105
106 # Remove any white space from filename -- no risk of name collision, and
107 # makes later conversion by utils simpler. Leave spaces in path...
108 $tailname =~ s/\s+//g;
109 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
110
111 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
112
113 &FileUtils::softLink($input_filename, $tmp_filename);
114
115 my $verbosity = $self->{'verbosity'};
116 if ($verbosity > 0) {
117 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
118 }
119
120 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");
121
122 # Execute the conversion command and get the type of the result,
123 # making sure the converter gives us the appropriate output type
124 my $output_type = lc($convert_to);
125 my $cmd = "\"".&util::get_perl_exec()."\" -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
126 $output_type = `$cmd`;
127
128 # remove symbolic link to original file
129 &FileUtils::removeFiles($tmp_filename);
130
131 # Check STDERR here
132 chomp $output_type;
133 if ($output_type eq "fail") {
134 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
135 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
136 $self->{'num_not_processed'} ++;
137 if (-s "$errlog") {
138 open(ERRLOG, "$errlog");
139 while (<ERRLOG>) {
140 print $outhandle "$_";
141 }
142 print $outhandle "\n";
143 close ERRLOG;
144 }
145 &FileUtils::removeFiles("$errlog") if (-e "$errlog");
146 return "";
147 }
148
149 # store the *actual* output type and return the output filename
150 # it's possible we requested conversion to html, but only to text succeeded
151
152 $self->{'convert_to_ext'} = $output_type;
153 $self->{'converted_to'} = "Rog";
154
155 my $output_filename = $tmp_filename;
156
157 $output_filename =~ s/$suffix$//;
158
159 return $output_filename;
160}
161
162
163# Remove collection specific tmp directory and all its contents.
164
165sub cleanup_tmp_area {
166 my $self = shift (@_);
167
168 my $tmp_dirname
169 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
170 &FileUtils::removeFilesRecursive($tmp_dirname);
171 &FileUtils::makeDirectory($tmp_dirname);
172}
173
174
175# Exact copy of read_rog_record from RogPlugin
176# Needed for FILE in right scope
177
178sub read_rog_record
179{
180 my ($self,$file_buffer, $docnum, $seclevel) = @_;
181
182 my $next_line = $file_buffer->{'next_line'};
183
184 return 0 if (!defined $next_line);
185
186 if ($next_line eq "")
187 {
188 my $line;
189 while(defined($line=<FILE>))
190 {
191 $line =~ s/\r$//;
192 $file_buffer->{'line_no'}++;
193 next if ($line =~ m/^\#/);
194 $next_line = $line;
195 last;
196 }
197 }
198
199 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
200 {
201 print STDERR "Error: Malformed Rog file: $next_line";
202 return 0;
203 }
204 else
205 {
206 # init default values
207 $file_buffer->{'song'}->{'tempo'} = 120;
208 $file_buffer->{'song'}->{'ks_type'} = 0;
209 $file_buffer->{'song'}->{'ks_num'} = 0;
210 $file_buffer->{'song'}->{'metadata'} = [];
211 $file_buffer->{'song'}->{'content'} = "";
212
213 $file_buffer->{'song'}->{'subcol'} = $2;
214 $file_buffer->{'song'}->{'title'} = $4;
215 $file_buffer->{'song'}->{'tval'} = $6;
216
217 chomp($next_line);
218 my $content = $next_line;
219 if (defined $docnum)
220 {
221 $content.= " $docnum $seclevel";
222 }
223 $content .= "\n";
224
225 $file_buffer->{'song'}->{'content'} = $content;
226
227
228 my $line;
229 while(defined($line=<FILE>))
230 {
231 $line =~ s/\r$//;
232
233 $file_buffer->{'line_no'}++;
234 next if ($line =~ m/^\#/);
235
236 if ($line =~ m/^song/)
237 {
238 $file_buffer->{'next_line'} = $line;
239 return 1;
240 }
241 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
242 {
243 $file_buffer->{'song'}->{'tempo'} = $2;
244 $file_buffer->{'song'}->{'content'} .= $line;
245 }
246 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
247 {
248 $file_buffer->{'song'}->{'ks_type'} = $2;
249 $file_buffer->{'song'}->{'ks_num'} = $4;
250 $file_buffer->{'song'}->{'content'} .= $line;
251 }
252 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
253 {
254 $file_buffer->{'song'}->{'ts_numer'} = $2;
255 $file_buffer->{'song'}->{'ts_denom'} = $4;
256 $file_buffer->{'song'}->{'content'} .= $line;
257 }
258 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
259 {
260 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
261 $file_buffer->{'song'}->{'content'} .= $line;
262 }
263 else
264 {
265 $file_buffer->{'song'}->{'content'} .= $line;
266 }
267 }
268
269 $file_buffer->{'next_line'} = undef;
270 }
271
272 return 1;
273}
274
275# Override RogPlugin function so rog files are stored as sections (not docs)
276
277sub process_rog_record
278{
279 my ($self,$doc_obj,$cursection,$song) = @_;
280
281 $cursection =
282 $doc_obj->insert_section($cursection);
283 $self->{'docnum'}++;
284
285 my $title = $song->{'title'};
286 my $title_safe = $title;
287 $title_safe =~ s/\'/\\\\&apos;/g;
288
289 # add metadata
290 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
291 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
292 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
293 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
294 $doc_obj->add_metadata($cursection, "Title", $title);
295 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
296 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
297
298 foreach my $md ( @{$song->{'metadata'}} )
299 {
300 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
301 }
302
303 # add contents as text
304 $doc_obj->add_text($cursection,$song->{'content'});
305
306 return $cursection;
307}
308
309
310
311# Override BaseImporter read
312# We don't want to get language encoding stuff until after we've converted
313# our file to Rog format
314sub read {
315 my $self = shift (@_);
316 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
317
318 my $outhandle = $self->{'outhandle'};
319
320 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321 return undef unless $self->can_process_this_file($filename_full_path);
322
323 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
324
325 # read in file ($text will be in utf8)
326 my $text = "";
327
328 my $output_ext = $self->{'convert_to_ext'};
329 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
330
331 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
332 $self->{'conv_filename'} = $conv_filename;
333
334
335 # create a new document
336 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
337 # the original filename is used now
338 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
339 # the converted filename is set separately
340 $doc_obj->set_converted_filename($conv_filename);
341
342 my $topsection = $doc_obj->get_top_section();
343 my $cursection = $topsection;
344
345 $self->{'docnum'}++;
346 my $docnum = $self->{'docnum'};
347
348 my ($filemeta) = $file =~ /([^\\\/]+)$/;
349 my $plugin_filename_encoding = $self->{'filename_encoding'};
350 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
351 $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding);
352
353 if ($self->{'cover_image'}) {
354 $self->associate_cover_image($doc_obj, $filename_full_path);
355 }
356 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
357 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
358
359 my $track_no = "1";
360 my $rog_filename = "$conv_filename$track_no.$output_ext";
361 while (1)
362 {
363 last unless open (FILE, $rog_filename) ;
364
365 my $file_buffer = { line_no => 0, next_line => "", song => {} };
366
367 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
368 {
369 my $song = $file_buffer->{'song'};
370 my $content = $song->{'content'};
371 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
372
373 $cursection
374 = $self->process_rog_record($doc_obj,$cursection,
375 $file_buffer->{'song'});
376 }
377
378 close FILE;
379
380 $track_no++;
381 $rog_filename = "$conv_filename$track_no.$output_ext";
382 }
383
384 print STDERR "\n";
385
386 # include any metadata passed in from previous plugins
387 # note that this metadata is associated with the top level section
388 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
389 # do plugin specific processing of doc_obj
390 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
391 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
392 return -1;
393 }
394 # do any automatic metadata extraction
395 $self->auto_extract_metadata ($doc_obj);
396 # add an OID
397 $self->add_OID($doc_obj);
398
399 my $oid = $doc_obj->get_OID();
400 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
401
402 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
403 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
404 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
405
406 # if no title metadata defined, set it to filename minus extension
407 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
408 if (!defined $existing_title)
409 {
410 my $title = $doc_obj->get_metadata_element($topsection,"Source");
411 $title =~ s/\..*?$//g;
412 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
413
414 my $title_safe = $title;
415 $title_safe =~ s/\'/\\\\&apos;/g;
416 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
417 }
418
419 # process the document
420 $processor->process($doc_obj);
421 $self->cleanup_tmp_area();
422
423 $self->{'num_processed'} ++;
424
425 return 1;
426}
427
428
429# do plugin specific processing of doc_obj for HTML type
430sub process_type {
431 my $self = shift (@_);
432 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
433
434 my $conv_filename = $self->{'conv_filename'};
435 my $tmp_dirname = File::Basename::dirname($conv_filename);
436 my $tmp_tailname = File::Basename::basename($conv_filename);
437
438 my $converted_to = $self->{'converted_to'};
439 my $ret_val = 1;
440
441# $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
442# $tmp_dirname, $tmp_tailname,
443# $metadata, $doc_obj);
444
445 # associate original file with doc object
446 my $cursection = $doc_obj->get_top_section();
447 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
448 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
449
450 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
451 $doc_obj->add_metadata ($cursection, "srclink_file", "doc.$doc_ext");
452 $doc_obj->add_metadata ($cursection, "srclinkFile", "doc.$doc_ext");
453 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
454
455 return $ret_val;
456}
457
4581;
Note: See TracBrowser for help on using the repository browser.