source: main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm@ 21760

Last change on this file since 21760 was 21760, checked in by kjdon, 14 years ago

srclink now generated dynamically at runtime. instead of storing srclink metadata, we store srclink_file metadata, which can be a value (doc.doc) or a metadata format element (eg [SourceFile]).

  • Property svn:keywords set to Author Date Id Revision
File size: 13.5 KB
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34sub BEGIN {
35 @ConvertToRogPlugin::ISA = ('RogPlugin');
36}
37
38my $arguments = [
39 ];
40my $options = { 'name' => "ConvertToRogPlugin",
41 'desc' => "{ConvertToRogPlugin.desc}",
42 'abstract' => "yes",
43 'inherits' => "yes" };
44
45sub new {
46 my ($class) = shift (@_);
47 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
48 push(@$pluginlist, $class);
49
50 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
51 push(@{$hashArgOptLists->{"OptList"}},$options);
52
53 my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
54
55 $self->{'convert_to'} = "Rog";
56 $self->{'convert_to_ext'} = "rog";
57
58 return bless $self, $class;
59}
60
61
62sub begin {
63 my $self = shift (@_);
64
65 $self->SUPER::begin(@_);
66
67 $self->{'docnum'} = 0;
68}
69
70sub end {
71 my ($self) = @_;
72
73 # nothing to do, but keep symmetric with begin function
74 $self->SUPER::end(@_);
75}
76
77
78# Run conversion utility on the input file.
79#
80# The conversion takes place in a collection specific 'tmp' directory so
81# that we don't accidentally damage the input.
82#
83# The desired output type is indicated by $output_ext. This is usually
84# something like "html" or "word", but can be "best" (or the empty string)
85# to indicate that the conversion utility should do the best it can.
86
87sub tmp_area_convert_file {
88 my $self = shift (@_);
89 my ($output_ext, $input_filename, $textref) = @_;
90
91 my $outhandle = $self->{'outhandle'};
92 my $convert_to = $self->{'convert_to'};
93 my $failhandle = $self->{'failhandle'};
94
95 # softlink to collection tmp dir
96 my $tmp_dirname
97 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
98 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
99
100 # derive tmp filename from input filename
101 my ($tailname, $dirname, $suffix)
102 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
103
104 # Remove any white space from filename -- no risk of name collision, and
105 # makes later conversion by utils simpler. Leave spaces in path...
106 $tailname =~ s/\s+//g;
107 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
108
109 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
110
111 &util::soft_link($input_filename, $tmp_filename);
112
113 my $verbosity = $self->{'verbosity'};
114 if ($verbosity > 0) {
115 print $outhandle "Converting $tailname$suffix to $convert_to format\n";
116 }
117
118 my $errlog = &util::filename_cat($tmp_dirname, "err.log");
119
120 # Execute the conversion command and get the type of the result,
121 # making sure the converter gives us the appropriate output type
122 my $output_type = lc($convert_to);
123 my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
124 $output_type = `$cmd`;
125
126 # remove symbolic link to original file
127 &util::rm($tmp_filename);
128
129 # Check STDERR here
130 chomp $output_type;
131 if ($output_type eq "fail") {
132 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
133 print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
134 $self->{'num_not_processed'} ++;
135 if (-s "$errlog") {
136 open(ERRLOG, "$errlog");
137 while (<ERRLOG>) {
138 print $outhandle "$_";
139 }
140 print $outhandle "\n";
141 close ERRLOG;
142 }
143 &util::rm("$errlog") if (-e "$errlog");
144 return "";
145 }
146
147 # store the *actual* output type and return the output filename
148 # it's possible we requested conversion to html, but only to text succeeded
149
150 $self->{'convert_to_ext'} = $output_type;
151 $self->{'converted_to'} = "Rog";
152
153 my $output_filename = $tmp_filename;
154
155 $output_filename =~ s/$suffix$//;
156
157 return $output_filename;
158}
159
160
161# Remove collection specific tmp directory and all its contents.
162
163sub cleanup_tmp_area {
164 my $self = shift (@_);
165
166 my $tmp_dirname
167 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
168 &util::rm_r($tmp_dirname);
169 &util::mk_dir($tmp_dirname);
170}
171
172
173# Exact copy of read_rog_record from RogPlugin
174# Needed for FILE in right scope
175
176sub read_rog_record
177{
178 my ($self,$file_buffer, $docnum, $seclevel) = @_;
179
180 my $next_line = $file_buffer->{'next_line'};
181
182 return 0 if (!defined $next_line);
183
184 if ($next_line eq "")
185 {
186 my $line;
187 while(defined($line=<FILE>))
188 {
189 $line =~ s/\r$//;
190 $file_buffer->{'line_no'}++;
191 next if ($line =~ m/^\#/);
192 $next_line = $line;
193 last;
194 }
195 }
196
197 if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
198 {
199 print STDERR "Error: Malformed Rog file: $next_line";
200 return 0;
201 }
202 else
203 {
204 # init default values
205 $file_buffer->{'song'}->{'tempo'} = 120;
206 $file_buffer->{'song'}->{'ks_type'} = 0;
207 $file_buffer->{'song'}->{'ks_num'} = 0;
208 $file_buffer->{'song'}->{'metadata'} = [];
209 $file_buffer->{'song'}->{'content'} = "";
210
211 $file_buffer->{'song'}->{'subcol'} = $2;
212 $file_buffer->{'song'}->{'title'} = $4;
213 $file_buffer->{'song'}->{'tval'} = $6;
214
215 chomp($next_line);
216 my $content = $next_line;
217 if (defined $docnum)
218 {
219 $content.= " $docnum $seclevel";
220 }
221 $content .= "\n";
222
223 $file_buffer->{'song'}->{'content'} = $content;
224
225
226 my $line;
227 while(defined($line=<FILE>))
228 {
229 $line =~ s/\r$//;
230
231 $file_buffer->{'line_no'}++;
232 next if ($line =~ m/^\#/);
233
234 if ($line =~ m/^song/)
235 {
236 $file_buffer->{'next_line'} = $line;
237 return 1;
238 }
239 elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
240 {
241 $file_buffer->{'song'}->{'tempo'} = $2;
242 $file_buffer->{'song'}->{'content'} .= $line;
243 }
244 elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
245 {
246 $file_buffer->{'song'}->{'ks_type'} = $2;
247 $file_buffer->{'song'}->{'ks_num'} = $4;
248 $file_buffer->{'song'}->{'content'} .= $line;
249 }
250 elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
251 {
252 $file_buffer->{'song'}->{'ts_numer'} = $2;
253 $file_buffer->{'song'}->{'ts_denom'} = $4;
254 $file_buffer->{'song'}->{'content'} .= $line;
255 }
256 elsif ($line =~ m/^metadata ([^:]*): (.*)/)
257 {
258 push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
259 $file_buffer->{'song'}->{'content'} .= $line;
260 }
261 else
262 {
263 $file_buffer->{'song'}->{'content'} .= $line;
264 }
265 }
266
267 $file_buffer->{'next_line'} = undef;
268 }
269
270 return 1;
271}
272
273# Override RogPlugin function so rog files are stored as sections (not docs)
274
275sub process_rog_record
276{
277 my ($self,$doc_obj,$cursection,$song) = @_;
278
279 $cursection =
280 $doc_obj->insert_section($cursection);
281 $self->{'docnum'}++;
282
283 my $title = $song->{'title'};
284 my $title_safe = $title;
285 $title_safe =~ s/\'/\\\\&apos;/g;
286
287 # add metadata
288 $doc_obj->add_metadata($cursection, "Tempo", $song->{'tempo'});
289 $doc_obj->add_metadata($cursection, "KeySigType", $song->{'ks_type'});
290 $doc_obj->add_metadata($cursection, "KeySigNum", $song->{'ks_num'});
291 $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
292 $doc_obj->add_metadata($cursection, "Title", $title);
293 $doc_obj->add_metadata($cursection, "TitleSafe", $title_safe);
294 $doc_obj->add_metadata($cursection, "TVal", $song->{'tval'});
295
296 foreach my $md ( @{$song->{'metadata'}} )
297 {
298 $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
299 }
300
301 # add contents as text
302 $doc_obj->add_text($cursection,$song->{'content'});
303
304 return $cursection;
305}
306
307
308
309# Override BasePlugin read
310# We don't want to get language encoding stuff until after we've converted
311# our file to Rog format
312sub read {
313 my $self = shift (@_);
314 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
315
316 my $outhandle = $self->{'outhandle'};
317
318 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
319 return undef unless $self->can_process_this_file($filename_full_path);
320
321 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
322
323 # read in file ($text will be in utf8)
324 my $text = "";
325
326 my $output_ext = $self->{'convert_to_ext'};
327 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
328
329 if ("$conv_filename" eq "") {return 0;} # allows continue on errors
330 $self->{'conv_filename'} = $conv_filename;
331
332
333 # create a new document
334 #my $doc_obj = new doc ($conv_filename, "indexed_doc");
335 # the original filename is used now
336 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
337 # the converted filename is set separately
338 $doc_obj->set_converted_filename($conv_filename);
339
340 my $topsection = $doc_obj->get_top_section();
341 my $cursection = $topsection;
342
343 $self->{'docnum'}++;
344 my $docnum = $self->{'docnum'};
345
346 my ($filemeta) = $file =~ /([^\\\/]+)$/;
347 $self->set_Source_metadata($doc_obj, $filemeta);
348
349 if ($self->{'cover_image'}) {
350 $self->associate_cover_image($doc_obj, $filename_full_path);
351 }
352 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
353 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
354
355 my $track_no = "1";
356 my $rog_filename = "$conv_filename$track_no.$output_ext";
357 while (1)
358 {
359 last unless open (FILE, $rog_filename) ;
360
361 my $file_buffer = { line_no => 0, next_line => "", song => {} };
362
363 while ($self->read_rog_record($file_buffer, $docnum, $track_no))
364 {
365 my $song = $file_buffer->{'song'};
366 my $content = $song->{'content'};
367 $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
368
369 $cursection
370 = $self->process_rog_record($doc_obj,$cursection,
371 $file_buffer->{'song'});
372 }
373
374 close FILE;
375
376 $track_no++;
377 $rog_filename = "$conv_filename$track_no.$output_ext";
378 }
379
380 print STDERR "\n";
381
382 # include any metadata passed in from previous plugins
383 # note that this metadata is associated with the top level section
384 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
385 # do plugin specific processing of doc_obj
386 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
387 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
388 return -1;
389 }
390 # do any automatic metadata extraction
391 $self->auto_extract_metadata ($doc_obj);
392 # add an OID
393 $self->add_OID($doc_obj);
394
395 my $oid = $doc_obj->get_OID();
396 my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
397
398 $doc_obj->add_utf8_metadata ($topsection, "audiolink", $appletlink);
399 $doc_obj->add_utf8_metadata ($topsection, "audioicon", "_iconaudio_");
400 $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
401
402 # if no title metadata defined, set it to filename minus extension
403 my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
404 if (!defined $existing_title)
405 {
406 my $title = $doc_obj->get_metadata_element($topsection,"Source");
407 $title =~ s/\..*?$//g;
408 $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
409
410 my $title_safe = $title;
411 $title_safe =~ s/\'/\\\\&apos;/g;
412 $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
413 }
414
415 # process the document
416 $processor->process($doc_obj);
417 $self->cleanup_tmp_area();
418
419 $self->{'num_processed'} ++;
420
421 return 1;
422}
423
424
425# do plugin specific processing of doc_obj for HTML type
426sub process_type {
427 my $self = shift (@_);
428 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
429
430 my $conv_filename = $self->{'conv_filename'};
431 my $tmp_dirname = File::Basename::dirname($conv_filename);
432 my $tmp_tailname = File::Basename::basename($conv_filename);
433
434 my $converted_to = $self->{'converted_to'};
435 my $ret_val = 1;
436
437# $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
438# $tmp_dirname, $tmp_tailname,
439# $metadata, $doc_obj);
440
441 # associate original file with doc object
442 my $cursection = $doc_obj->get_top_section();
443 my $filename = &util::filename_cat($base_dir, $file);
444 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
445
446 #my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/doc.$doc_ext\">";
447 #$doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink);
448 $doc_obj->add_metadata ($section, "srclink_file", "doc.$doc_ext");
449 $doc_obj->add_utf8_metadata ($cursection, "srcicon", "_icon".$doc_ext."_");
450 #$doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
451 return $ret_val;
452}
453
4541;
Note: See TracBrowser for help on using the repository browser.