root/main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm @ 24225

Revision 24225, 13.7 KB (checked in by ak19, 8 years ago)

Still on ticket 449. Now srclink_file metadata (contains an underscore that makes things difficult for GS3) is renamed to srclinkFile. Related commits are in GS2's runtime-src formattools.cpp and dublincore.cpp and GS3's default/transform/config_format.xsl and Action.java.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33use Config; # for getting the perlpath in the recommended way
34
35sub BEGIN {
36    @ConvertToRogPlugin::ISA = ('RogPlugin');
37}
38
39my $arguments = [
40         ];
41my $options = { 'name'     => "ConvertToRogPlugin",
42        'desc'     => "{ConvertToRogPlugin.desc}",
43        'abstract' => "yes",
44        'inherits' => "yes" };
45
46sub new {
47    my ($class) = shift (@_);
48    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
49    push(@$pluginlist, $class);
50
51    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
52    push(@{$hashArgOptLists->{"OptList"}},$options);
53
54    my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
55
56    $self->{'convert_to'} = "Rog";
57    $self->{'convert_to_ext'} = "rog";
58
59    return bless $self, $class;
60}
61
62
63sub begin {
64    my $self = shift (@_);
65   
66    $self->SUPER::begin(@_);
67
68    $self->{'docnum'} = 0;
69}
70
71sub end {
72    my ($self) = @_;
73
74    # nothing to do, but keep symmetric with begin function
75    $self->SUPER::end(@_);
76}
77
78
79# Run conversion utility on the input file. 
80#
81# The conversion takes place in a collection specific 'tmp' directory so
82# that we don't accidentally damage the input.
83#
84# The desired output type is indicated by $output_ext.  This is usually
85# something like "html" or "word", but can be "best" (or the empty string)
86# to indicate that the conversion utility should do the best it can.
87
88sub tmp_area_convert_file {
89    my $self = shift (@_);
90    my ($output_ext, $input_filename, $textref) = @_;
91
92    my $outhandle = $self->{'outhandle'};
93    my $convert_to = $self->{'convert_to'};
94    my $failhandle = $self->{'failhandle'};
95
96    # softlink to collection tmp dir
97    my $tmp_dirname
98    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
99    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
100
101    # derive tmp filename from input filename
102    my ($tailname, $dirname, $suffix)
103    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
104
105    # Remove any white space from filename -- no risk of name collision, and
106    # makes later conversion by utils simpler. Leave spaces in path...
107    $tailname =~ s/\s+//g;
108    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
109
110    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
111
112    &util::soft_link($input_filename, $tmp_filename);
113
114    my $verbosity = $self->{'verbosity'};
115    if ($verbosity > 0) {
116    print $outhandle "Converting $tailname$suffix to $convert_to format\n";
117    }
118
119    my $errlog = &util::filename_cat($tmp_dirname, "err.log");
120   
121    # Execute the conversion command and get the type of the result,
122    # making sure the converter gives us the appropriate output type
123    my $output_type = lc($convert_to);
124    my $cmd = "\"$Config{perlpath}\" -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
125    $output_type = `$cmd`;
126
127    # remove symbolic link to original file
128    &util::rm($tmp_filename);
129
130    # Check STDERR here
131    chomp $output_type;
132    if ($output_type eq "fail") {
133    print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
134    print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
135    $self->{'num_not_processed'} ++;
136    if (-s "$errlog") {
137        open(ERRLOG, "$errlog");
138        while (<ERRLOG>) {
139        print $outhandle "$_";
140        }
141        print $outhandle "\n";
142        close ERRLOG;
143    }
144    &util::rm("$errlog") if (-e "$errlog");
145    return "";
146    }
147
148    # store the *actual* output type and return the output filename
149    # it's possible we requested conversion to html, but only to text succeeded
150
151    $self->{'convert_to_ext'} = $output_type;
152    $self->{'converted_to'} = "Rog";
153
154    my $output_filename = $tmp_filename;
155
156    $output_filename =~ s/$suffix$//;
157
158    return $output_filename;
159}
160
161
162# Remove collection specific tmp directory and all its contents.
163
164sub cleanup_tmp_area {
165    my $self = shift (@_);
166
167    my $tmp_dirname
168    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
169    &util::rm_r($tmp_dirname);
170    &util::mk_dir($tmp_dirname);
171}
172
173
174# Exact copy of read_rog_record from RogPlugin
175# Needed for FILE in right scope
176
177sub read_rog_record
178{
179    my ($self,$file_buffer, $docnum, $seclevel) = @_;
180
181    my $next_line = $file_buffer->{'next_line'};
182
183    return 0 if (!defined $next_line);
184
185    if ($next_line eq "")
186    {
187    my $line;
188    while(defined($line=<FILE>)) 
189    {
190        $line =~ s/\r$//;
191        $file_buffer->{'line_no'}++;
192        next if ($line =~ m/^\#/);
193        $next_line = $line;
194        last;
195    }
196    }
197       
198    if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
199    {
200    print STDERR "Error: Malformed Rog file: $next_line";
201    return 0;
202    }
203    else
204    {
205    # init default values
206    $file_buffer->{'song'}->{'tempo'}    = 120;
207    $file_buffer->{'song'}->{'ks_type'}  = 0;
208    $file_buffer->{'song'}->{'ks_num'}   = 0;
209    $file_buffer->{'song'}->{'metadata'} = [];
210    $file_buffer->{'song'}->{'content'}  = "";
211   
212    $file_buffer->{'song'}->{'subcol'} = $2;
213    $file_buffer->{'song'}->{'title'}  = $4;
214    $file_buffer->{'song'}->{'tval'}   = $6;
215
216    chomp($next_line);
217    my $content = $next_line;
218    if (defined $docnum)
219    {
220        $content.= " $docnum $seclevel";
221    }
222    $content .= "\n";
223
224    $file_buffer->{'song'}->{'content'} = $content;
225
226
227    my $line;
228    while(defined($line=<FILE>)) 
229    {
230        $line =~ s/\r$//;
231
232        $file_buffer->{'line_no'}++;
233        next if ($line =~ m/^\#/);
234   
235        if ($line =~ m/^song/)
236        {   
237        $file_buffer->{'next_line'} = $line;
238        return 1;
239        }
240        elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
241        {
242        $file_buffer->{'song'}->{'tempo'} = $2;
243        $file_buffer->{'song'}->{'content'} .= $line;
244        }
245        elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
246        {
247        $file_buffer->{'song'}->{'ks_type'} = $2;
248        $file_buffer->{'song'}->{'ks_num'}  = $4;
249        $file_buffer->{'song'}->{'content'} .= $line;     
250        }
251        elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
252        {
253        $file_buffer->{'song'}->{'ts_numer'} = $2;
254        $file_buffer->{'song'}->{'ts_denom'} = $4;
255        $file_buffer->{'song'}->{'content'} .= $line;
256        }
257        elsif ($line =~ m/^metadata ([^:]*): (.*)/)
258        {
259        push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
260        $file_buffer->{'song'}->{'content'} .= $line;
261        }
262        else
263        {
264        $file_buffer->{'song'}->{'content'} .= $line;
265        }
266    }
267   
268    $file_buffer->{'next_line'} = undef;
269    }
270
271    return 1;
272}
273
274# Override RogPlugin function so rog files are stored as sections (not docs)
275
276sub process_rog_record
277{
278    my ($self,$doc_obj,$cursection,$song) = @_;
279
280    $cursection =
281    $doc_obj->insert_section($cursection);
282    $self->{'docnum'}++;
283
284    my $title = $song->{'title'};
285    my $title_safe = $title;
286    $title_safe =~ s/\'/\\\\&apos;/g;
287
288    # add metadata
289    $doc_obj->add_metadata($cursection, "Tempo",         $song->{'tempo'});
290    $doc_obj->add_metadata($cursection, "KeySigType",    $song->{'ks_type'});
291    $doc_obj->add_metadata($cursection, "KeySigNum",     $song->{'ks_num'});
292    $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
293    $doc_obj->add_metadata($cursection, "Title",         $title);
294    $doc_obj->add_metadata($cursection, "TitleSafe",     $title_safe);
295    $doc_obj->add_metadata($cursection, "TVal",          $song->{'tval'});
296
297    foreach my $md ( @{$song->{'metadata'}} )
298    {
299    $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
300    }
301
302    # add contents as text
303    $doc_obj->add_text($cursection,$song->{'content'});
304
305    return $cursection;
306}
307
308
309
310# Override BasePlugin read
311# We don't want to get language encoding stuff until after we've converted
312# our file to Rog format
313sub read {
314    my $self = shift (@_);
315    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
316
317    my $outhandle = $self->{'outhandle'};
318
319    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
320    return undef unless $self->can_process_this_file($filename_full_path);
321 
322    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
323
324    # read in file ($text will be in utf8)
325    my $text = "";
326
327    my $output_ext = $self->{'convert_to_ext'};
328    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
329
330    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
331    $self->{'conv_filename'} = $conv_filename;
332
333
334    # create a new document
335    #my $doc_obj = new doc ($conv_filename, "indexed_doc");
336    # the original filename is used now
337    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
338    # the converted filename is set separately
339    $doc_obj->set_converted_filename($conv_filename);
340
341    my $topsection = $doc_obj->get_top_section();
342    my $cursection = $topsection;
343
344    $self->{'docnum'}++;
345    my $docnum = $self->{'docnum'};
346
347    my ($filemeta) = $file =~ /([^\\\/]+)$/;
348    my $plugin_filename_encoding = $self->{'filename_encoding'};
349    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
350    $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding);
351   
352    if ($self->{'cover_image'}) {
353    $self->associate_cover_image($doc_obj, $filename_full_path);
354    }
355    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
356    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
357
358    my $track_no = "1";
359    my $rog_filename = "$conv_filename$track_no.$output_ext";
360    while (1)
361    {
362    last unless open (FILE, $rog_filename) ;
363
364    my $file_buffer = { line_no => 0, next_line => "", song => {} };
365   
366    while ($self->read_rog_record($file_buffer, $docnum, $track_no))
367    {
368        my $song = $file_buffer->{'song'};
369        my $content = $song->{'content'};
370        $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
371       
372        $cursection
373        = $self->process_rog_record($doc_obj,$cursection,
374                        $file_buffer->{'song'});
375    }
376
377    close FILE;
378
379    $track_no++;
380    $rog_filename = "$conv_filename$track_no.$output_ext";
381    }
382
383    print STDERR "\n";
384
385    # include any metadata passed in from previous plugins
386    # note that this metadata is associated with the top level section
387    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
388    # do plugin specific processing of doc_obj
389    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
390    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
391    return -1;
392    }
393    # do any automatic metadata extraction
394    $self->auto_extract_metadata ($doc_obj);
395    # add an OID
396    $self->add_OID($doc_obj);
397
398    my $oid = $doc_obj->get_OID();
399    my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
400
401    $doc_obj->add_utf8_metadata ($topsection, "audiolink",  $appletlink);
402    $doc_obj->add_utf8_metadata ($topsection, "audioicon",  "_iconaudio_");
403    $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
404
405    # if no title metadata defined, set it to filename minus extension
406    my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
407    if (!defined $existing_title)
408    {
409    my $title = $doc_obj->get_metadata_element($topsection,"Source");
410    $title =~ s/\..*?$//g;
411    $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
412
413    my $title_safe = $title;
414    $title_safe =~ s/\'/\\\\&apos;/g;
415    $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
416    }
417
418    # process the document
419    $processor->process($doc_obj);
420    $self->cleanup_tmp_area();
421
422    $self->{'num_processed'} ++;
423
424    return 1;
425}
426
427
428# do plugin specific processing of doc_obj for HTML type
429sub process_type {
430    my $self = shift (@_);
431    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
432   
433    my $conv_filename = $self->{'conv_filename'};
434    my $tmp_dirname = File::Basename::dirname($conv_filename);
435    my $tmp_tailname = File::Basename::basename($conv_filename);
436   
437    my $converted_to = $self->{'converted_to'};
438    my $ret_val = 1;   
439
440#   $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
441#                $tmp_dirname, $tmp_tailname,
442#                $metadata, $doc_obj);
443
444    # associate original file with doc object
445    my $cursection = $doc_obj->get_top_section();
446    my $filename = &util::filename_cat($base_dir, $file);
447    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
448
449    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
450    $doc_obj->add_metadata ($cursection, "srclink_file", "doc.$doc_ext");
451    $doc_obj->add_metadata ($cursection, "srclinkFile", "doc.$doc_ext");
452    $doc_obj->add_utf8_metadata ($cursection, "srcicon",  "_icon".$doc_ext."_");
453
454    return $ret_val;
455}
456
4571;
Note: See TracBrowser for help on using the browser.