source: trunk/gsdl/perllib/plugouts/BasPlugout.pm@ 13064

Last change on this file since 13064 was 13064, checked in by shaoqun, 18 years ago

made it work for path with spaces

  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1###########################################################################
2#
3# BasPlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use gsprintf 'gsprintf';
35use printusage;
36use parse2;
37
38# suppress the annoying "subroutine redefined" warning that various
39# gets cause under perl 5.6
40$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
41
42my $arguments = [
43 { 'name' => "group_size",
44 'desc' => "{BasPlugout.group_size}",
45 'type' => "int",
46 'deft' => "1",
47 'reqd' => "no",
48 'hiddengli' => "no"},
49 { 'name' => "output_info",
50 'desc' => "{BasPlugout.output_info}",
51 'type' => "string",
52 'reqd' => "yes",
53 'hiddengli' => "yes"},
54 { 'name' => "xslt_file",
55 'desc' => "{BasPlugout.xslt_file}",
56 'type' => "string",
57 'reqd' => "no",
58 'hiddengli' => "no"},
59 { 'name' => "output_handle",
60 'desc' => "{BasPlugout.output_handle}",
61 'type' => "string",
62 'deft' => 'STDERR',
63 'reqd' => "no",
64 'hiddengli' => "yes"},
65 { 'name' => "verbosity",
66 'desc' => "{BasPlugout.verbosity}",
67 'type' => "int",
68 'deft' => "0",
69 'reqd' => "no",
70 'hiddengli' => "no"},
71 { 'name' => "gzip_output",
72 'desc' => "{BasPlugout.gzip_output}",
73 'type' => "flag",
74 'reqd' => "no",
75 'hiddengli' => "no"}
76];
77
78my $options = { 'name' => "BasPlugout",
79 'desc' => "{BasPlugout.desc}",
80 'abstract' => "yes",
81 'inherits' => "no",
82 'args' => $arguments};
83
84sub new
85{
86 my $class = shift (@_);
87
88 my ($plugoutlist,$args,$hashArgOptLists) = @_;
89 push(@$plugoutlist, $class);
90
91 my $strPlugoutName = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
92
93 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
94 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
95
96 my $self = {};
97 $self->{'plugout_type'} = $class;
98 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
99 $self->{"info_only"} = 0;
100
101 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
102 # the args, just return the object.
103 foreach my $strArg (@{$args})
104 {
105 if(defined $strArg && $strArg eq "-gsdlinfo")
106 {
107 $self->{"info_only"} = 1;
108 return bless $self, $class;
109 }
110 }
111
112 delete $self->{"info_only"};
113
114 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
115 {
116 my $classTempClass = bless $self, $class;
117 print STDERR "<BadPlugout d=$self->{'plugout_name'}>\n";
118 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $self->{'plugout_name'});
119 $classTempClass->print_txt_usage(""); # Use default resource bundle
120 die "\n";
121 }
122
123
124 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
125 {
126 ##$self->{'xslt_file'} =~ s/\"//g;##working on Windows???
127 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n" and die "\n" unless (-e $self->{'xslt_file'});
128 }
129
130 $self->{'gs_count'} = 0;
131
132 $self->{'keep_import_structure'} = 0;
133
134 return bless $self, $class;
135
136}
137
138sub print_xml_usage
139{
140 my $self = shift(@_);
141 my $header = shift(@_);
142 my $high_level_information_only = shift(@_);
143
144 # XML output is always in UTF-8
145 gsprintf::output_strings_in_UTF8;
146
147 if ($header) {
148 &PrintUsage::print_xml_header("plugout");
149 }
150 $self->print_xml($high_level_information_only);
151}
152
153
154sub print_xml
155{
156 my $self = shift(@_);
157 my $high_level_information_only = shift(@_);
158
159 my $optionlistref = $self->{'option_list'};
160 my @optionlist = @$optionlistref;
161 my $plugoutoptions = shift(@$optionlistref);
162 return if (!defined($plugoutoptions));
163
164 gsprintf(STDERR, "<PlugoutInfo>\n");
165 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
166 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
167 $desc =~ s/</&amp;lt;/g; # doubly escaped
168 $desc =~ s/>/&amp;gt;/g;
169 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
170 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
171 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
172 unless (defined($high_level_information_only)) {
173 gsprintf(STDERR, " <Arguments>\n");
174 if (defined($plugoutoptions->{'args'})) {
175 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
176 }
177 gsprintf(STDERR, " </Arguments>\n");
178
179 # Recurse up the plugout hierarchy
180 $self->print_xml();
181 }
182 gsprintf(STDERR, "</PlugoutInfo>\n");
183}
184
185
186sub print_txt_usage
187{
188 my $self = shift(@_);
189
190 # Print the usage message for a plugout (recursively)
191 my $descoffset = $self->determine_description_offset(0);
192 $self->print_plugout_usage($descoffset, 1);
193}
194
195sub determine_description_offset
196{
197 my $self = shift(@_);
198 my $maxoffset = shift(@_);
199
200 my $optionlistref = $self->{'option_list'};
201 my @optionlist = @$optionlistref;
202 my $plugoutoptions = pop(@$optionlistref);
203 return $maxoffset if (!defined($plugoutoptions));
204
205 # Find the length of the longest option string of this download
206 my $plugoutargs = $plugoutoptions->{'args'};
207 if (defined($plugoutargs)) {
208 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
209 if ($longest > $maxoffset) {
210 $maxoffset = $longest;
211 }
212 }
213
214 # Recurse up the download hierarchy
215 $maxoffset = $self->determine_description_offset($maxoffset);
216 $self->{'option_list'} = \@optionlist;
217 return $maxoffset;
218}
219
220
221sub print_plugout_usage
222{
223 my $self = shift(@_);
224 my $descoffset = shift(@_);
225 my $isleafclass = shift(@_);
226
227 my $optionlistref = $self->{'option_list'};
228 my @optionlist = @$optionlistref;
229 my $plugoutoptions = shift(@$optionlistref);
230 return if (!defined($plugoutoptions));
231
232 my $plugoutname = $plugoutoptions->{'name'};
233 my $plugoutargs = $plugoutoptions->{'args'};
234 my $plugoutdesc = $plugoutoptions->{'desc'};
235
236 # Produce the usage information using the data structure above
237 if ($isleafclass) {
238 if (defined($plugoutdesc)) {
239 gsprintf(STDERR, "$plugoutdesc\n\n");
240 }
241 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
242 }
243
244 # Display the download options, if there are some
245 if (defined($plugoutargs)) {
246 # Calculate the column offset of the option descriptions
247 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
248
249 if ($isleafclass) {
250 gsprintf(STDERR, " {common.specific_options}:\n");
251 }
252 else {
253 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
254 }
255
256 # Display the download options
257 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
258 }
259
260 # Recurse up the download hierarchy
261 $self->print_plugout_usage($descoffset, 0);
262 $self->{'option_list'} = \@optionlist;
263}
264
265
266sub error
267{
268 my ($strFunctionName,$strError) = @_;
269 {
270 print "Error occoured in BasPlugout.pm\n".
271 "In Function: ".$strFunctionName."\n".
272 "Error Message: ".$strError."\n";
273 exit(-1);
274 }
275}
276
277# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
278sub set_OIDtype {
279 my $self = shift (@_);
280 my ($type, $metadata) = @_;
281
282 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
283 $self->{'OIDtype'} = $type;
284 } else {
285 $self->{'OIDtype'} = "hash";
286 }
287 if ($type =~ /^assigned$/) {
288 if (defined $metadata) {
289 $self->{'OIDmetadata'} = $metadata;
290 } else {
291 $self->{'OIDmetadata'} = "dc.Identifier";
292 }
293 }
294}
295
296sub set_output_dir
297{
298 my $self = shift @_;
299 my ($output_dir) = @_;
300
301 $self->{'output_dir'} = $output_dir;
302}
303
304sub setoutputdir
305{
306 my $self = shift @_;
307 my ($output_dir) = @_;
308
309 $self->{'output_dir'} = $output_dir;
310}
311
312sub get_output_dir
313{
314 my $self = shift (@_);
315
316 return $self->{'output_dir'};
317}
318
319sub getoutputdir
320{
321 my $self = shift (@_);
322
323 return $self->{'output_dir'};
324}
325
326sub getoutputinfo
327{
328 my $self = shift (@_);
329
330 return $self->{'output_info'};
331}
332
333
334sub get_output_handler
335{
336 my $self = shift (@_);
337
338 my ($output_file_name) = @_;
339
340 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
341
342 return *OUTPUT;
343}
344
345sub release_output_handler
346{
347 my $self = shift (@_);
348 my ($outhandler) = @_;
349
350 close($outhandler);
351
352}
353
354sub output_xml_header {
355 my $self = shift (@_);
356 my ($handle,$docroot,$nondoctype) = @_;
357
358 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
359
360 if (!defined $nondoctype){
361 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
362 }
363
364 print $handle "<$docroot>\n" if defined $docroot;
365}
366
367sub output_xml_footer {
368 my $self = shift (@_);
369 my ($handle,$docroot) = @_;
370 print $handle "</$docroot>\n" if defined $docroot;
371}
372
373sub process {
374 my $self = shift (@_);
375 my ($doc_obj) = @_;
376
377 $doc_obj->set_lastmodified();
378
379 if ($self->{'group_size'} > 1) {
380 $self->group_process ($doc_obj);
381 return;
382 }
383
384 my $OID = $doc_obj->get_OID();
385 $OID = "NULL" unless defined $OID;
386
387 my $top_section = $doc_obj->get_top_section();
388
389 #get document's directory
390 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
391
392 my $output_info = $self->{'output_info'};
393 return if (!defined $output_info);
394
395 ##############################
396 # call subclass' saveas method
397 ##############################
398 $self->saveas($doc_obj,$doc_dir);
399
400}
401
402sub store_output_info_reference {
403 my $self = shift (@_);
404 my ($doc_obj) = @_;
405
406 my $output_info = $self->{'output_info'};
407 my $metaname = $self->{'sortmeta'};
408 if (!defined $metaname || $metaname !~ /\S/) {
409 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
410 return;
411 }
412
413 my $metadata = "";
414 my $top_section = $doc_obj->get_top_section();
415
416 my @commameta_list = split(/,/, $metaname);
417 foreach my $cmn (@commameta_list) {
418 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
419 if ($meta) {
420 # do remove prefix/suffix - this will apply to all values
421 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
422 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
423 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
424 $metadata .= $meta if ($meta);
425 }
426 }
427
428 # store reference in the output_info
429 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
430
431}
432
433sub group_process {
434
435 my $self = shift (@_);
436 my ($doc_obj) = @_;
437
438 my $OID = $doc_obj->get_OID();
439 $OID = "NULL" unless defined $OID;
440
441 my $groupsize = $self->{'group_size'};
442 my $gs_count = $self->{'gs_count'};
443 my $open_new_file = (($gs_count % $groupsize)==0);
444 my $outhandle = $self->{'output_handle'};
445
446 # opening a new file, or document has assoicated files => directory needed
447 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
448
449 # The directory the archive file (doc.xml) and all associated files
450 # should end up in
451 my $doc_dir;
452 # If we've determined its time for a new file, open it now
453 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
454 {
455 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456 # only if opening new file
457 my $output_dir = $self->get_output_dir();
458 &util::mk_all_dir ($output_dir) unless -e $output_dir;
459 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
460 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
461
462 if ($gs_count>0)
463 {
464 return if (!$self->close_file_output());
465 }
466
467 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasPlugout::group_process could not write to file $doc_file\n" and return);
468
469
470 $self->{'gs_filename'} = $doc_file;
471 $self->{'short_doc_file'} = $short_doc_file;
472 $self->{'gs_OID'} = $OID;
473 $self->{'gs_doc_dir'} = $doc_dir;
474
475 $self->output_xml_header('BasPlugout::GROUPPROCESS','Archive');
476 }
477 # Otherwise load the same archive document directory used last time
478 else
479 {
480 $doc_dir = $self->{'gs_doc_dir'};
481 }
482
483 # copy all the associated files, add this information as metadata
484 # to the document
485 print STDERR "Writing associated files to $doc_dir\n";
486 $self->process_assoc_files ($doc_obj, $doc_dir);
487 }
488
489 # save this document
490 $doc_obj->output_section('BasPlugout::GROUPPROCESS', $doc_obj->get_top_section());
491
492 $self->{'gs_count'}++;
493}
494
495
496sub saveas {
497 my $self = shift (@_);
498
499 die "Basplug::saveas function must be implemented in sub classes\n";
500}
501
502sub get_doc_dir {
503 my $self = shift (@_);
504 my ($OID, $source_filename) = @_;
505
506 my $working_dir = $self->get_output_dir();
507 my $working_info = $self->{output_info};
508 return if (!defined $working_info);
509
510 my $doc_info = $working_info->get_info($OID);
511 my $doc_dir = '';
512
513 if (defined $doc_info && scalar(@$doc_info) >= 1) {
514 # this OID already has an assigned directory, use the
515 # same one.
516 $doc_dir = $doc_info->[0];
517 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
518 } elsif ($self->{'keep_import_structure'}) {
519 $source_filename = &File::Basename::dirname($source_filename);
520 $source_filename =~ s/[\\\/]+/\//g;
521 $source_filename =~ s/\/$//;
522
523 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
524
525 }
526
527 # have to get a new document directory
528 $doc_dir = $self->get_new_doc_dir($working_info,$working_dir,$OID) unless $doc_dir ne "";
529
530 $doc_dir .= ".dir";
531 if (!defined $self->{'group'} || !$self->{'group'}){
532 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
533 }
534 return $doc_dir;
535}
536
537sub get_new_doc_dir{
538 my $self = shift (@_);
539 my($working_info,$working_dir,$OID) = @_;
540
541 my $doc_dir = "";
542 my $doc_dir_rest = $OID;
543 my $doc_dir_num = 0;
544
545 do {
546 $doc_dir .= "/" if $doc_dir_num > 0;
547 if ($doc_dir_rest =~ s/^(.{1,8})//) {
548 $doc_dir .= $1;
549 $doc_dir_num++;
550 }
551 } while ($doc_dir_rest ne "" &&
552 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
553 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
554
555
556 return $doc_dir;
557}
558
559sub process_assoc_files {
560 my $self = shift (@_);
561 my ($doc_obj, $doc_dir, $handle) = @_;
562
563 my $outhandle = $self->{'output_handle'};
564
565 my $output_dir = $self->get_output_dir();
566 return if (!defined $output_dir);
567
568 &util::mk_all_dir ($output_dir) unless -e $output_dir;
569
570 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
571 &util::mk_all_dir ($working_dir) unless -e $working_dir;
572
573 my @assoc_files = ();
574 my $filename;;
575
576 my $source_filename = $doc_obj->get_source_filename();
577
578 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
579
580 if (defined $collect_dir) {
581 my $dirsep_regexp = &util::get_os_dirsep();
582
583 if ($collect_dir !~ /$dirsep_regexp$/) {
584 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
585 }
586
587 # This test is never going to fail on Windows -- is this a problem?
588
589 if ($source_filename !~ /^$dirsep_regexp/) {
590 $source_filename = &util::filename_cat($collect_dir, $source_filename);
591 }
592 }
593
594
595 # set the assocfile path (even if we have no assoc files - need this for lucene)
596 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
597 "assocfilepath",
598 "$doc_dir");
599 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
600 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
601 $dir = "" unless defined $dir;
602
603
604 my $real_filename = $assoc_file_rec->[0];
605 # for some reasons the image associate file has / before the full path
606 $real_filename =~ s/^\\(.*)/$1/i;
607 if (-e $real_filename) {
608
609 $filename = &util::filename_cat($working_dir, $afile);
610
611 &util::hard_link ($real_filename, $filename);
612
613 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
614 "gsdlassocfile",
615 "$afile:$assoc_file_rec->[2]:$dir");
616 } elsif ($self->{'verbosity'} > 2) {
617 print $outhandle "BasPlugout::process couldn't copy the associated file " .
618 "$real_filename to $afile\n";
619 }
620 }
621}
622
623sub set_sortmeta {
624 my $self = shift (@_);
625 my ($sortmeta, $removeprefix, $removesuffix) = @_;
626
627 $self->{'sortmeta'} = $sortmeta;
628 if (defined ($removeprefix) && $removeprefix ) {
629 $removeprefix =~ s/^\^//; # don't need a leading ^
630 $self->{'removeprefix'} = $removeprefix;
631 }
632 if (defined ($removesuffix) && $removesuffix) {
633 $removesuffix =~ s/\$$//; # don't need a trailing $
634 $self->{'removesuffix'} = $removesuffix;
635 }
636}
637
638sub open_xslt_pipe
639{
640 my $self = shift @_;
641 my ($output_file_name, $xslt_file)=@_;
642
643 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
644
645 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
646
647 $java_class_path = "\"".$java_class_path."\"";
648
649 $xslt_file = "\"".$xslt_file."\"";
650
651 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT $xslt_file ";
652
653 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
654 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
655 $cmd .= $mapping_file_path;
656 }
657
658 open(*XMLWRITER, $cmd)
659 or die "can't open pipe to xslt: $!";
660
661
662 $self->{'xslt_writer'} = *XMLWRITER;
663
664 print XMLWRITER "<?DocStart?>\n";
665 print XMLWRITER "$output_file_name\n";
666
667 }
668
669
670sub close_xslt_pipe
671{
672 my $self = shift @_;
673
674
675 return unless defined $self->{'xslt_writer'} ;
676
677 my $xsltwriter = $self->{'xslt_writer'};
678
679 print $xsltwriter "<?DocEnd?>\n";
680 close($xsltwriter);
681
682 undef $self->{'xslt_writer'};
683
684}
685
686sub close_file_output
687{
688 my ($self) = @_;
689
690 # make sure that the handle has been opened - it won't be if we failed
691 # to import any documents...
692 if (defined(fileno(GROUPPROCESS))) {
693 $self->output_xml_footer('GROUPPROCESS','Archive');
694 close GROUPPROCESS;
695 }
696
697 my $OID = $self->{'gs_OID'};
698 my $short_doc_file = $self->{'short_doc_file'};
699
700 if ($self->{'gzip'}) {
701 my $doc_file = $self->{'gs_filename'};
702 `gzip $doc_file`;
703 $doc_file .= ".gz";
704 $short_doc_file .= ".gz";
705 if (!-e $doc_file) {
706 my $outhandle = $self->{'output_handle'};
707 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
708 return 0;
709 }
710 }
711
712 # store reference in output_info
713 my $output_info = $self->{'output_info'};
714 return 0 if (!defined $output_info);
715 $output_info->add_info($OID, $short_doc_file, undef, undef);
716 return 1;
717}
718
719#the subclass should implement this method if is_group method could return 1.
720sub close_group_output{
721 my $self = shift (@_);
722}
723
724sub is_group {
725 my $self = shift (@_);
726 return 0;
727}
728
7291;
Note: See TracBrowser for help on using the repository browser.