root/main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm @ 21764

Revision 21764, 13.5 KB (checked in by kjdon, 10 years ago)

fixed up all my copy and paste errors. doh.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ConvertToRogPlugin.pm -- plugin that inherits from RogPlugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package ConvertToRogPlugin;
29
30use RogPlugin;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34sub BEGIN {
35    @ConvertToRogPlugin::ISA = ('RogPlugin');
36}
37
38my $arguments = [
39         ];
40my $options = { 'name'     => "ConvertToRogPlugin",
41        'desc'     => "{ConvertToRogPlugin.desc}",
42        'abstract' => "yes",
43        'inherits' => "yes" };
44
45sub new {
46    my ($class) = shift (@_);
47    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
48    push(@$pluginlist, $class);
49
50    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
51    push(@{$hashArgOptLists->{"OptList"}},$options);
52
53    my $self = new RogPlugin($pluginlist, $inputargs, $hashArgOptLists);
54
55    $self->{'convert_to'} = "Rog";
56    $self->{'convert_to_ext'} = "rog";
57
58    return bless $self, $class;
59}
60
61
62sub begin {
63    my $self = shift (@_);
64   
65    $self->SUPER::begin(@_);
66
67    $self->{'docnum'} = 0;
68}
69
70sub end {
71    my ($self) = @_;
72
73    # nothing to do, but keep symmetric with begin function
74    $self->SUPER::end(@_);
75}
76
77
78# Run conversion utility on the input file. 
79#
80# The conversion takes place in a collection specific 'tmp' directory so
81# that we don't accidentally damage the input.
82#
83# The desired output type is indicated by $output_ext.  This is usually
84# something like "html" or "word", but can be "best" (or the empty string)
85# to indicate that the conversion utility should do the best it can.
86
87sub tmp_area_convert_file {
88    my $self = shift (@_);
89    my ($output_ext, $input_filename, $textref) = @_;
90
91    my $outhandle = $self->{'outhandle'};
92    my $convert_to = $self->{'convert_to'};
93    my $failhandle = $self->{'failhandle'};
94
95    # softlink to collection tmp dir
96    my $tmp_dirname
97    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
98    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
99
100    # derive tmp filename from input filename
101    my ($tailname, $dirname, $suffix)
102    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
103
104    # Remove any white space from filename -- no risk of name collision, and
105    # makes later conversion by utils simpler. Leave spaces in path...
106    $tailname =~ s/\s+//g;
107    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
108
109    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
110
111    &util::soft_link($input_filename, $tmp_filename);
112
113    my $verbosity = $self->{'verbosity'};
114    if ($verbosity > 0) {
115    print $outhandle "Converting $tailname$suffix to $convert_to format\n";
116    }
117
118    my $errlog = &util::filename_cat($tmp_dirname, "err.log");
119   
120    # Execute the conversion command and get the type of the result,
121    # making sure the converter gives us the appropriate output type
122    my $output_type = lc($convert_to);
123    my $cmd = "perl -S gsMusicConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
124    $output_type = `$cmd`;
125
126    # remove symbolic link to original file
127    &util::rm($tmp_filename);
128
129    # Check STDERR here
130    chomp $output_type;
131    if ($output_type eq "fail") {
132    print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
133    print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
134    $self->{'num_not_processed'} ++;
135    if (-s "$errlog") {
136        open(ERRLOG, "$errlog");
137        while (<ERRLOG>) {
138        print $outhandle "$_";
139        }
140        print $outhandle "\n";
141        close ERRLOG;
142    }
143    &util::rm("$errlog") if (-e "$errlog");
144    return "";
145    }
146
147    # store the *actual* output type and return the output filename
148    # it's possible we requested conversion to html, but only to text succeeded
149
150    $self->{'convert_to_ext'} = $output_type;
151    $self->{'converted_to'} = "Rog";
152
153    my $output_filename = $tmp_filename;
154
155    $output_filename =~ s/$suffix$//;
156
157    return $output_filename;
158}
159
160
161# Remove collection specific tmp directory and all its contents.
162
163sub cleanup_tmp_area {
164    my $self = shift (@_);
165
166    my $tmp_dirname
167    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
168    &util::rm_r($tmp_dirname);
169    &util::mk_dir($tmp_dirname);
170}
171
172
173# Exact copy of read_rog_record from RogPlugin
174# Needed for FILE in right scope
175
176sub read_rog_record
177{
178    my ($self,$file_buffer, $docnum, $seclevel) = @_;
179
180    my $next_line = $file_buffer->{'next_line'};
181
182    return 0 if (!defined $next_line);
183
184    if ($next_line eq "")
185    {
186    my $line;
187    while(defined($line=<FILE>)) 
188    {
189        $line =~ s/\r$//;
190        $file_buffer->{'line_no'}++;
191        next if ($line =~ m/^\#/);
192        $next_line = $line;
193        last;
194    }
195    }
196       
197    if ($next_line !~ m/^song( +)\"([^\"]*)\"( +)\"([^\"]*)\"( +)(\d+)( *)$/)
198    {
199    print STDERR "Error: Malformed Rog file: $next_line";
200    return 0;
201    }
202    else
203    {
204    # init default values
205    $file_buffer->{'song'}->{'tempo'}    = 120;
206    $file_buffer->{'song'}->{'ks_type'}  = 0;
207    $file_buffer->{'song'}->{'ks_num'}   = 0;
208    $file_buffer->{'song'}->{'metadata'} = [];
209    $file_buffer->{'song'}->{'content'}  = "";
210   
211    $file_buffer->{'song'}->{'subcol'} = $2;
212    $file_buffer->{'song'}->{'title'}  = $4;
213    $file_buffer->{'song'}->{'tval'}   = $6;
214
215    chomp($next_line);
216    my $content = $next_line;
217    if (defined $docnum)
218    {
219        $content.= " $docnum $seclevel";
220    }
221    $content .= "\n";
222
223    $file_buffer->{'song'}->{'content'} = $content;
224
225
226    my $line;
227    while(defined($line=<FILE>)) 
228    {
229        $line =~ s/\r$//;
230
231        $file_buffer->{'line_no'}++;
232        next if ($line =~ m/^\#/);
233   
234        if ($line =~ m/^song/)
235        {   
236        $file_buffer->{'next_line'} = $line;
237        return 1;
238        }
239        elsif ($line =~ m/^tempo( +)(\d+)( *)$/)
240        {
241        $file_buffer->{'song'}->{'tempo'} = $2;
242        $file_buffer->{'song'}->{'content'} .= $line;
243        }
244        elsif ($line =~ m/^keysig( +)(\d+)( +)(\d+)( *)$/)
245        {
246        $file_buffer->{'song'}->{'ks_type'} = $2;
247        $file_buffer->{'song'}->{'ks_num'}  = $4;
248        $file_buffer->{'song'}->{'content'} .= $line;     
249        }
250        elsif ($line =~ m/^timesig( +)(\d+)( +)(\d+)( *)$/)
251        {
252        $file_buffer->{'song'}->{'ts_numer'} = $2;
253        $file_buffer->{'song'}->{'ts_denom'} = $4;
254        $file_buffer->{'song'}->{'content'} .= $line;
255        }
256        elsif ($line =~ m/^metadata ([^:]*): (.*)/)
257        {
258        push(@{$file_buffer->{'song'}->{'metadata'}},[$1,$2]);
259        $file_buffer->{'song'}->{'content'} .= $line;
260        }
261        else
262        {
263        $file_buffer->{'song'}->{'content'} .= $line;
264        }
265    }
266   
267    $file_buffer->{'next_line'} = undef;
268    }
269
270    return 1;
271}
272
273# Override RogPlugin function so rog files are stored as sections (not docs)
274
275sub process_rog_record
276{
277    my ($self,$doc_obj,$cursection,$song) = @_;
278
279    $cursection =
280    $doc_obj->insert_section($cursection);
281    $self->{'docnum'}++;
282
283    my $title = $song->{'title'};
284    my $title_safe = $title;
285    $title_safe =~ s/\'/\\\\&apos;/g;
286
287    # add metadata
288    $doc_obj->add_metadata($cursection, "Tempo",         $song->{'tempo'});
289    $doc_obj->add_metadata($cursection, "KeySigType",    $song->{'ks_type'});
290    $doc_obj->add_metadata($cursection, "KeySigNum",     $song->{'ks_num'});
291    $doc_obj->add_metadata($cursection, "SubCollection", $song->{'subcol'});
292    $doc_obj->add_metadata($cursection, "Title",         $title);
293    $doc_obj->add_metadata($cursection, "TitleSafe",     $title_safe);
294    $doc_obj->add_metadata($cursection, "TVal",          $song->{'tval'});
295
296    foreach my $md ( @{$song->{'metadata'}} )
297    {
298    $doc_obj->add_metadata($cursection, $md->[0], $md->[1]);
299    }
300
301    # add contents as text
302    $doc_obj->add_text($cursection,$song->{'content'});
303
304    return $cursection;
305}
306
307
308
309# Override BasePlugin read
310# We don't want to get language encoding stuff until after we've converted
311# our file to Rog format
312sub read {
313    my $self = shift (@_);
314    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
315
316    my $outhandle = $self->{'outhandle'};
317
318    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
319    return undef unless $self->can_process_this_file($filename_full_path);
320 
321    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
322
323    # read in file ($text will be in utf8)
324    my $text = "";
325
326    my $output_ext = $self->{'convert_to_ext'};
327    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
328
329    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
330    $self->{'conv_filename'} = $conv_filename;
331
332
333    # create a new document
334    #my $doc_obj = new doc ($conv_filename, "indexed_doc");
335    # the original filename is used now
336    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
337    # the converted filename is set separately
338    $doc_obj->set_converted_filename($conv_filename);
339
340    my $topsection = $doc_obj->get_top_section();
341    my $cursection = $topsection;
342
343    $self->{'docnum'}++;
344    my $docnum = $self->{'docnum'};
345
346    my ($filemeta) = $file =~ /([^\\\/]+)$/;
347    $self->set_Source_metadata($doc_obj, $filemeta);
348   
349    if ($self->{'cover_image'}) {
350    $self->associate_cover_image($doc_obj, $filename_full_path);
351    }
352    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
353    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
354
355    my $track_no = "1";
356    my $rog_filename = "$conv_filename$track_no.$output_ext";
357    while (1)
358    {
359    last unless open (FILE, $rog_filename) ;
360
361    my $file_buffer = { line_no => 0, next_line => "", song => {} };
362   
363    while ($self->read_rog_record($file_buffer, $docnum, $track_no))
364    {
365        my $song = $file_buffer->{'song'};
366        my $content = $song->{'content'};
367        $content =~ s/^song\w+(.*)$/song $1 X.$track_no/;
368       
369        $cursection
370        = $self->process_rog_record($doc_obj,$cursection,
371                        $file_buffer->{'song'});
372    }
373
374    close FILE;
375
376    $track_no++;
377    $rog_filename = "$conv_filename$track_no.$output_ext";
378    }
379
380    print STDERR "\n";
381
382    # include any metadata passed in from previous plugins
383    # note that this metadata is associated with the top level section
384    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
385    # do plugin specific processing of doc_obj
386    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
387    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
388    return -1;
389    }
390    # do any automatic metadata extraction
391    $self->auto_extract_metadata ($doc_obj);
392    # add an OID
393    $self->add_OID($doc_obj);
394
395    my $oid = $doc_obj->get_OID();
396    my $appletlink = "<a href=\"javascript:meldexout(\'$oid\','[TitleSafe]')\">";
397
398    $doc_obj->add_utf8_metadata ($topsection, "audiolink",  $appletlink);
399    $doc_obj->add_utf8_metadata ($topsection, "audioicon",  "_iconaudio_");
400    $doc_obj->add_utf8_metadata ($topsection, "/audiolink", "</a>");
401
402    # if no title metadata defined, set it to filename minus extension
403    my $existing_title = $doc_obj->get_metadata_element($topsection,"Title");
404    if (!defined $existing_title)
405    {
406    my $title = $doc_obj->get_metadata_element($topsection,"Source");
407    $title =~ s/\..*?$//g;
408    $doc_obj->add_utf8_metadata ($topsection, "Title", $title);
409
410    my $title_safe = $title;
411    $title_safe =~ s/\'/\\\\&apos;/g;
412    $doc_obj->add_utf8_metadata ($topsection, "TitleSafe", $title_safe);
413    }
414
415    # process the document
416    $processor->process($doc_obj);
417    $self->cleanup_tmp_area();
418
419    $self->{'num_processed'} ++;
420
421    return 1;
422}
423
424
425# do plugin specific processing of doc_obj for HTML type
426sub process_type {
427    my $self = shift (@_);
428    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
429   
430    my $conv_filename = $self->{'conv_filename'};
431    my $tmp_dirname = File::Basename::dirname($conv_filename);
432    my $tmp_tailname = File::Basename::basename($conv_filename);
433   
434    my $converted_to = $self->{'converted_to'};
435    my $ret_val = 1;   
436
437#   $ret_val = &RogPlugin::process($self, $textref, $pluginfo,
438#                $tmp_dirname, $tmp_tailname,
439#                $metadata, $doc_obj);
440
441    # associate original file with doc object
442    my $cursection = $doc_obj->get_top_section();
443    my $filename = &util::filename_cat($base_dir, $file);
444    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);
445
446    #my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/doc.$doc_ext\">";
447    #$doc_obj->add_utf8_metadata ($cursection, "srclink",  $doclink);
448    $doc_obj->add_metadata ($cursection, "srclink_file", "doc.$doc_ext");
449    $doc_obj->add_utf8_metadata ($cursection, "srcicon",  "_icon".$doc_ext."_");
450    #$doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
451    return $ret_val;
452}
453
4541;
Note: See TracBrowser for help on using the browser.