source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27501

Last change on this file since 27501 was 27501, checked in by jmt12, 11 years ago

Missed (another) old style file open that instead needs to go through FileUtils::openFileHandle() - this time for the doomed RSS file (which won't work properly in the HDFS collections due to the current lack of proper file append)

  • Property svn:keywords set to Author Date Id Revision
File size: 33.0 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "no",
96 'hiddengli' => "no"}
97];
98
99my $options = { 'name' => "BasePlugout",
100 'desc' => "{BasPlugout.desc}",
101 'abstract' => "yes",
102 'inherits' => "no",
103 'args' => $arguments};
104
105sub new
106{
107 my $class = shift (@_);
108
109 my ($plugoutlist,$args,$hashArgOptLists) = @_;
110 push(@$plugoutlist, $class);
111
112 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
113
114 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115 push(@{$hashArgOptLists->{"OptList"}},$options);
116
117 my $self = {};
118 $self->{'plugout_type'} = $class;
119 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120 $self->{"info_only"} = 0;
121
122 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123 # the args, just return the object.
124 foreach my $strArg (@{$args})
125 {
126 if(defined $strArg && $strArg eq "-gsdlinfo")
127 {
128 $self->{"info_only"} = 1;
129 return bless $self, $class;
130 }
131 }
132
133 delete $self->{"info_only"};
134
135 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
136 {
137 my $classTempClass = bless $self, $class;
138 print STDERR "<BadPlugout d=$plugout_name>\n";
139 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
140 $classTempClass->print_txt_usage(""); # Use default resource bundle
141 die "\n";
142 }
143
144
145 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
146 {
147 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148 if (!defined $full_file_path) {
149 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150 die "\n";
151 }
152 $self->{'xslt_file'} = $full_file_path;
153 }
154
155 $self->{'gs_count'} = 0;
156
157 $self->{'keep_import_structure'} = 0;
158
159 return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165 my $self = shift(@_);
166 my $header = shift(@_);
167 my $high_level_information_only = shift(@_);
168
169 # XML output is always in UTF-8
170 gsprintf::output_strings_in_UTF8;
171
172 if ($header) {
173 &PrintUsage::print_xml_header("plugout");
174 }
175 $self->print_xml($high_level_information_only);
176}
177
178
179sub print_xml
180{
181 my $self = shift(@_);
182 my $high_level_information_only = shift(@_);
183
184 my $optionlistref = $self->{'option_list'};
185 my @optionlist = @$optionlistref;
186 my $plugoutoptions = shift(@$optionlistref);
187 return if (!defined($plugoutoptions));
188
189 gsprintf(STDERR, "<PlugoutInfo>\n");
190 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
191 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192 $desc =~ s/</&amp;lt;/g; # doubly escaped
193 $desc =~ s/>/&amp;gt;/g;
194 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
195 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
197 unless (defined($high_level_information_only)) {
198 gsprintf(STDERR, " <Arguments>\n");
199 if (defined($plugoutoptions->{'args'})) {
200 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201 }
202 gsprintf(STDERR, " </Arguments>\n");
203
204 # Recurse up the plugout hierarchy
205 $self->print_xml();
206 }
207 gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213 my $self = shift(@_);
214
215 # Print the usage message for a plugout (recursively)
216 my $descoffset = $self->determine_description_offset(0);
217 $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222 my $self = shift(@_);
223 my $maxoffset = shift(@_);
224
225 my $optionlistref = $self->{'option_list'};
226 my @optionlist = @$optionlistref;
227 my $plugoutoptions = pop(@$optionlistref);
228 return $maxoffset if (!defined($plugoutoptions));
229
230 # Find the length of the longest option string of this download
231 my $plugoutargs = $plugoutoptions->{'args'};
232 if (defined($plugoutargs)) {
233 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234 if ($longest > $maxoffset) {
235 $maxoffset = $longest;
236 }
237 }
238
239 # Recurse up the download hierarchy
240 $maxoffset = $self->determine_description_offset($maxoffset);
241 $self->{'option_list'} = \@optionlist;
242 return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248 my $self = shift(@_);
249 my $descoffset = shift(@_);
250 my $isleafclass = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $plugoutoptions = shift(@$optionlistref);
255 return if (!defined($plugoutoptions));
256
257 my $plugoutname = $plugoutoptions->{'name'};
258 my $plugoutargs = $plugoutoptions->{'args'};
259 my $plugoutdesc = $plugoutoptions->{'desc'};
260
261 # Produce the usage information using the data structure above
262 if ($isleafclass) {
263 if (defined($plugoutdesc)) {
264 gsprintf(STDERR, "$plugoutdesc\n\n");
265 }
266 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267 }
268
269 # Display the download options, if there are some
270 if (defined($plugoutargs)) {
271 # Calculate the column offset of the option descriptions
272 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
273
274 if ($isleafclass) {
275 gsprintf(STDERR, " {common.specific_options}:\n");
276 }
277 else {
278 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279 }
280
281 # Display the download options
282 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283 }
284
285 # Recurse up the download hierarchy
286 $self->print_plugout_usage($descoffset, 0);
287 $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293 my ($strFunctionName,$strError) = @_;
294 {
295 print "Error occoured in BasePlugout.pm\n".
296 "In Function: ".$strFunctionName."\n".
297 "Error Message: ".$strError."\n";
298 exit(-1);
299 }
300}
301
302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
303sub set_OIDtype {
304 my $self = shift (@_);
305 my ($type, $metadata) = @_;
306
307 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
308 $self->{'OIDtype'} = $type;
309 } else {
310 $self->{'OIDtype'} = "hash";
311 }
312 if ($type =~ /^assigned$/) {
313 if (defined $metadata) {
314 $self->{'OIDmetadata'} = $metadata;
315 } else {
316 $self->{'OIDmetadata'} = "dc.Identifier";
317 }
318 }
319}
320
321sub set_output_dir
322{
323 my $self = shift @_;
324 my ($output_dir) = @_;
325
326 $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331 my $self = shift @_;
332 my ($output_dir) = @_;
333
334 $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339 my $self = shift (@_);
340
341 return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346 my $self = shift (@_);
347
348 return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353 my $self = shift (@_);
354
355 return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361 my $self = shift (@_);
362
363 my ($output_file_name) = @_;
364
365 my $fh;
366 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
367
368 return $fh;
369}
370
371sub release_output_handler
372{
373 my $self = shift (@_);
374 my ($outhandler) = @_;
375
376 close($outhandler);
377
378}
379
380sub output_xml_header {
381 my $self = shift (@_);
382 my ($handle,$docroot,$nondoctype) = @_;
383
384
385 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
386
387 #For Dspace must be UTF in lower case
388 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
389
390 if (!defined $nondoctype){
391 my $doctype = (defined $docroot) ? $docroot : "Section";
392
393 # Used to be '<!DOCTYPE Archive SYSTEM ...'
394
395 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
396 }
397
398 print $handle "<$docroot>\n" if defined $docroot;
399}
400
401sub output_xml_footer {
402 my $self = shift (@_);
403 my ($handle,$docroot) = @_;
404 print $handle "</$docroot>\n" if defined $docroot;
405}
406
407
408sub output_general_xml_header
409{
410 my $self = shift (@_);
411 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
412
413 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
414
415 if (defined $opt_dtd) {
416 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
417 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
418 }
419
420 if (defined $docroot) {
421 my $full_docroot = $docroot;
422 if (defined $opt_attributes) {
423 $full_docroot .= " $opt_attributes";
424 }
425
426 print $handle "<$full_docroot>\n"
427 }
428}
429
430sub output_general_xml_footer
431{
432 output_xml_footer(@_);
433}
434
435
436sub process {
437 my $self = shift (@_);
438 my ($doc_obj) = @_;
439
440 # for OAI purposes
441 $doc_obj->set_lastmodified();
442 $doc_obj->set_oailastmodified();
443
444 if ($self->{'group_size'} > 1) {
445 $self->group_process ($doc_obj);
446 return;
447 }
448
449 my $OID = $doc_obj->get_OID();
450 $OID = "NULL" unless defined $OID;
451
452 my $top_section = $doc_obj->get_top_section();
453
454 #get document's directory
455 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456
457 my $output_info = $self->{'output_info'};
458 return if (!defined $output_info);
459
460 ##############################
461 # call subclass' saveas method
462 ##############################
463 $self->saveas($doc_obj,$doc_dir);
464 $self->archiveinf_db($doc_obj,$doc_dir);
465
466}
467
468sub store_output_info_reference {
469 my $self = shift (@_);
470 my ($doc_obj) = @_;
471
472 my $output_info = $self->{'output_info'};
473 my $metaname = $self->{'sortmeta'};
474 if (!defined $metaname || $metaname !~ /\S/) {
475 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
476 return;
477 }
478
479 my $metadata = "";
480 my $top_section = $doc_obj->get_top_section();
481
482 my @commameta_list = split(/,/, $metaname);
483 foreach my $cmn (@commameta_list) {
484 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
485 if ($meta) {
486 # do remove prefix/suffix - this will apply to all values
487 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
488 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
489 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
490 $metadata .= $meta if ($meta);
491 }
492 }
493
494 # store reference in the output_info
495 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
496
497}
498
499sub group_process {
500
501 my $self = shift (@_);
502 my ($doc_obj) = @_;
503
504 my $OID = $doc_obj->get_OID();
505 $OID = "NULL" unless defined $OID;
506
507 my $groupsize = $self->{'group_size'};
508 my $gs_count = $self->{'gs_count'};
509 my $open_new_file = (($gs_count % $groupsize)==0);
510 my $outhandle = $self->{'output_handle'};
511
512 # opening a new file, or document has assoicated files => directory needed
513 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
514
515 # The directory the archive file (doc.xml) and all associated files
516 # should end up in
517 my $doc_dir;
518 # If we've determined its time for a new file, open it now
519 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
520 {
521 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
522 # only if opening new file
523 my $output_dir = $self->get_output_dir();
524 &FileUtils::makeAllDirectories($output_dir) unless -e $output_dir;
525 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
526 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
527
528 if ($gs_count>0)
529 {
530 return if (!$self->close_file_output());
531 }
532
533 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
534
535
536 $self->{'gs_filename'} = $doc_file;
537 $self->{'short_doc_file'} = $short_doc_file;
538 $self->{'gs_OID'} = $OID;
539 $self->{'gs_doc_dir'} = $doc_dir;
540
541 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
542 }
543 # Otherwise load the same archive document directory used last time
544 else
545 {
546 $doc_dir = $self->{'gs_doc_dir'};
547 }
548
549 # copy all the associated files, add this information as metadata
550 # to the document
551 print $outhandle "Writing associated files to $doc_dir\n";
552 $self->process_assoc_files ($doc_obj, $doc_dir);
553
554 # look up 'gsdlmetafile' metadata and store that information
555 # explicitly in $doc_obj
556 $self->process_metafiles_metadata ($doc_obj);
557 }
558
559 # save this document
560 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
561 print GROUPPROCESS $section_text;
562
563 $self->{'gs_count'}++;
564}
565
566
567sub saveas {
568 my $self = shift (@_);
569
570 die "Basplug::saveas function must be implemented in sub classes\n";
571}
572
573sub get_doc_dir {
574 my $self = shift (@_);
575 my ($OID, $source_filename) = @_;
576
577 my $working_dir = $self->get_output_dir();
578 my $working_info = $self->{'output_info'};
579 return if (!defined $working_info);
580
581 my $doc_info = $working_info->get_info($OID);
582 my $doc_dir = '';
583
584 if (defined $doc_info && scalar(@$doc_info) >= 1)
585 {
586 # This OID already has an archives directory, so use it again
587 $doc_dir = $doc_info->[0];
588 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
589 }
590 elsif ($self->{'keep_import_structure'})
591 {
592 $source_filename = &File::Basename::dirname($source_filename);
593 $source_filename =~ s/[\\\/]+/\//g;
594 $source_filename =~ s/\/$//;
595
596 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
597 }
598
599 # We have to use a new archives directory for this document
600 if ($doc_dir eq "")
601 {
602 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
603 }
604
605 if (!defined $self->{'group'} || !$self->{'group'}){
606 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
607 }
608
609 return $doc_dir;
610}
611
612sub get_new_doc_dir{
613 my $self = shift (@_);
614 my($working_info,$working_dir,$OID) = @_;
615
616
617 my $doc_dir = "";
618 my $doc_dir_rest = $OID;
619
620 # remove any \ and / from the OID
621 $doc_dir_rest =~ s/[\\\/]//g;
622
623 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
624 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
625
626 my $doc_dir_num = 0;
627
628 do {
629 $doc_dir .= "/" if $doc_dir_num > 0;
630 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
631 if ($self->{'subdir_hash_prefix'})
632 {
633 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
634 }
635 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
636 if ($doc_dir_rest =~ s/$pattern//i)
637 {
638 $doc_dir .= $1;
639 $doc_dir_num++;
640 }
641 } while ($doc_dir_rest ne "" &&
642 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
643 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
644 my $i = 1;
645 my $doc_dir_base = $doc_dir;
646 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
647 $doc_dir = "$doc_dir_base-$i";
648 $i++;
649 }
650
651 return "$doc_dir.dir";
652}
653
654sub process_assoc_files {
655 my $self = shift (@_);
656 my ($doc_obj, $doc_dir, $handle) = @_;
657
658 my $outhandle = $self->{'output_handle'};
659
660 my $output_dir = $self->get_output_dir();
661 return if (!defined $output_dir);
662
663 &FileUtils::makeAllDirectories($output_dir) unless -e $output_dir;
664
665 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
666 &FileUtils::makeAllDirectories($working_dir) unless -e $working_dir;
667
668 my @assoc_files = ();
669 my $filename;;
670
671 my $source_filename = $doc_obj->get_source_filename();
672
673 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
674
675 if (defined $collect_dir) {
676 my $dirsep_regexp = &util::get_os_dirsep();
677
678 if ($collect_dir !~ /$dirsep_regexp$/) {
679 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
680 }
681
682 # This test is never going to fail on Windows -- is this a problem?
683
684 if ($source_filename !~ /^$dirsep_regexp/) {
685 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
686 }
687 }
688
689
690 # set the assocfile path (even if we have no assoc files - need this for lucene)
691 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
692 "assocfilepath",
693 "$doc_dir");
694 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
695 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
696 $dir = "" unless defined $dir;
697
698 my $utf8_real_filename = $assoc_file_rec->[0];
699
700 # for some reasons the image associate file has / before the full path
701 $utf8_real_filename =~ s/^\\(.*)/$1/i;
702
703## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
704 my $real_filename = $utf8_real_filename;
705 $real_filename = &util::downgrade_if_dos_filename($real_filename);
706
707 if (-e $real_filename) {
708
709 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
710
711 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
712
713 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
714 "gsdlassocfile",
715 "$afile:$assoc_file_rec->[2]:$dir");
716 } elsif ($self->{'verbosity'} > 1) {
717 print $outhandle "BasePlugout::process couldn't copy the associated file " .
718 "$real_filename to $afile\n";
719 }
720 }
721}
722
723
724sub process_metafiles_metadata
725{
726 my $self = shift (@_);
727 my ($doc_obj) = @_;
728
729 my $top_section = $doc_obj->get_top_section();
730 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
731
732 foreach my $metafile_pair (@$metafiles) {
733 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
734
735 $doc_obj->metadata_file($full_metafile,$metafile);
736 }
737
738 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
739}
740
741sub archiveinf_files_to_field
742{
743 my $self = shift(@_);
744 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
745
746 foreach my $file_rec (@$files) {
747 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
748 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
749 # for some reasons the image associate file has / before the full path
750 $real_filename =~ s/^\\(.*)/$1/i;
751
752 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
753
754 if (-e $raw_filename) {
755
756# if (defined $collect_dir) {
757# my $collect_dir_re_safe = $collect_dir;
758# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
759# $collect_dir_re_safe =~ s/\./\\./g;##
760
761# $real_filename =~ s/^$collect_dir_re_safe//;
762# }
763
764 if (defined $reverse_lookups) {
765 $reverse_lookups->{$real_filename} = 1;
766 }
767### push(@{$oid_files->{$field}},$full_file);
768 push(@{$oid_files->{$field}},$raw_filename);
769 }
770 else {
771 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
772 }
773 }
774}
775
776sub archiveinf_db
777{
778 my $self = shift (@_);
779 my ($doc_obj) = @_;
780
781 my $verbosity = $self->{'verbosity'};
782
783 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
784 if (defined $collect_dir) {
785 my $dirsep_regexp = &util::get_os_dirsep();
786
787 if ($collect_dir !~ /$dirsep_regexp$/) {
788 # ensure there is a slash at the end
789 $collect_dir .= &util::get_dirsep();
790 }
791 }
792
793 my $oid = $doc_obj->get_OID();
794 my $source_filename = $doc_obj->get_unmodified_source_filename();
795 my $working_info = $self->{'output_info'};
796 my $doc_info = $working_info->get_info($oid);
797
798 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
799 # doc_file is the path to the archive doc.xml. Make sure it has unix
800 # slashes, then if the collection is copied to linux, it can be built without reimport
801 $doc_file =~ s/\\/\//g;
802 my $oid_files = { 'doc-file' => $doc_file,
803 'index-status' => $index_status,
804 'src-file' => $source_filename,
805 'sort-meta' => $sortmeta,
806 'assoc-file' => [],
807 'meta-file' => [] };
808
809 my $reverse_lookups = { $source_filename => "1" };
810
811
812 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
813 $collect_dir,$oid_files,$reverse_lookups);
814
815
816 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
817 $collect_dir,$oid_files);
818
819 # Get the infodbtype value for this collection from the arcinfo object
820 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
821 my $output_dir = $self->{'output_dir'};
822
823 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
824 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
825
826 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
827
828 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
829 my $top_section = $doc_obj->get_top_section();
830
831 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
832 my $dc_title = join("; ", @$dc_titles);
833
834 if ($oid_files->{'index-status'} eq "R") {
835 $dc_title .= " (Updated)";
836 }
837
838 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
839 my $rss_fh;
840 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh)) {
841 print $rss_fh "<item>\n";
842 print $rss_fh " <title>$dc_title</title>\n";
843 print $rss_fh " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
844 print $rss_fh "</item>\n";
845 &FileUtils::closeFileHandle($rss_fh);
846 }
847 else {
848 print STDERR "**** Failed to open $rss_filename\n!$\n";
849 }
850
851
852 }
853
854 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
855 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
856 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
857 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
858
859 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
860 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
861 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
862
863 foreach my $rl (keys %$reverse_lookups) {
864 $working_info->add_reverseinfo($rl,$oid);
865 }
866
867 # meta files not set in reverese entry, but need to set the metadata flag
868 if (defined $doc_obj->get_meta_files()) {
869 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
870 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
871 $working_info->set_meta_file_flag($full_file);
872 }
873 }
874}
875
876
877sub set_sortmeta {
878 my $self = shift (@_);
879 my ($sortmeta, $removeprefix, $removesuffix) = @_;
880
881 $self->{'sortmeta'} = $sortmeta;
882 if (defined ($removeprefix) && $removeprefix ) {
883 $removeprefix =~ s/^\^//; # don't need a leading ^
884 $self->{'removeprefix'} = $removeprefix;
885 }
886 if (defined ($removesuffix) && $removesuffix) {
887 $removesuffix =~ s/\$$//; # don't need a trailing $
888 $self->{'removesuffix'} = $removesuffix;
889 }
890}
891
892sub open_xslt_pipe
893{
894 my $self = shift @_;
895 my ($output_file_name, $xslt_file)=@_;
896
897 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
898
899 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
900
901 my $mapping_file_path = "";
902
903 if ($ENV{'GSDLOS'} eq "windows"){
904 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
905 # this file:/// bit didn't work for me on windows XP
906 #$xslt_file = "\"file:///".$xslt_file."\"";
907 #$mapping_file_path = "\"file:///";
908 }
909 else{
910 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
911 }
912
913
914 $java_class_path = "\"".$java_class_path."\"";
915
916 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
917
918 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
919 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
920 $cmd .= "-m $mapping_file_path";
921 }
922
923 open(*XMLWRITER, $cmd)
924 or die "can't open pipe to xslt: $!";
925
926
927 $self->{'xslt_writer'} = *XMLWRITER;
928
929 print XMLWRITER "<?DocStart?>\n";
930 print XMLWRITER "$output_file_name\n";
931
932
933 }
934
935
936sub close_xslt_pipe
937{
938 my $self = shift @_;
939
940
941 return unless defined $self->{'xslt_writer'} ;
942
943 my $xsltwriter = $self->{'xslt_writer'};
944
945 print $xsltwriter "<?DocEnd?>\n";
946 close($xsltwriter);
947
948 undef $self->{'xslt_writer'};
949
950}
951
952sub close_file_output
953{
954 my ($self) = @_;
955
956 # make sure that the handle has been opened - it won't be if we failed
957 # to import any documents...
958 if (defined(fileno(GROUPPROCESS))) {
959 $self->output_xml_footer('GROUPPROCESS','Archive');
960 close GROUPPROCESS;
961 }
962
963 my $OID = $self->{'gs_OID'};
964 my $short_doc_file = $self->{'short_doc_file'};
965
966 if ($self->{'gzip'}) {
967 my $doc_file = $self->{'gs_filename'};
968 `gzip $doc_file`;
969 $doc_file .= ".gz";
970 $short_doc_file .= ".gz";
971 if (!-e $doc_file) {
972 my $outhandle = $self->{'output_handle'};
973 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
974 return 0;
975 }
976 }
977
978 # store reference in output_info
979 my $output_info = $self->{'output_info'};
980 return 0 if (!defined $output_info);
981 $output_info->add_info($OID, $short_doc_file, undef, undef);
982 return 1;
983}
984
985
986#the subclass should implement this method if is_group method could return 1.
987sub close_group_output{
988 my $self = shift (@_);
989}
990
991sub is_group {
992 my $self = shift (@_);
993 return 0;
994}
995
996my $dc_set = { Title => 1,
997 Creator => 1,
998 Subject => 1,
999 Description => 1,
1000 Publisher => 1,
1001 Contributor => 1,
1002 Date => 1,
1003 Type => 1,
1004 Format => 1,
1005 Identifier => 1,
1006 Source => 1,
1007 Language => 1,
1008 Relation => 1,
1009 Coverage => 1,
1010 Rights => 1};
1011
1012
1013# returns an XML representation of the dublin core metadata
1014# if dc meta is not found, try ex meta
1015# This method is not used by the DSpacePlugout, which has its
1016# own method to save its dc metadata
1017sub get_dc_metadata {
1018 my $self = shift(@_);
1019 my ($doc_obj, $section, $version) = @_;
1020
1021 # build up string of dublin core metadata
1022 $section="" unless defined $section;
1023
1024 my $section_ptr = $doc_obj->_lookup_section($section);
1025 return "" unless defined $section_ptr;
1026
1027
1028 my $explicit_dc = {};
1029 my $explicit_ex_dc = {};
1030 my $explicit_ex = {};
1031
1032 my $all_text="";
1033
1034 # We want high quality dc metadata to go in first, so we store all the
1035 # assigned dc.* values first. Then, for all those dc metadata names in
1036 # the official dc set that are as yet unassigned, we look to see whether
1037 # embedded ex.dc.* metadata has defined some values for them. If not,
1038 # then for the same missing dc metadata names, we look in ex metadata.
1039
1040 foreach my $data (@{$section_ptr->{'metadata'}}){
1041 my $escaped_value = &docprint::escape_text($data->[1]);
1042 if ($data->[0]=~ m/^dc\./) {
1043 $data->[0] =~ tr/[A-Z]/[a-z]/;
1044
1045 $data->[0] =~ m/^dc\.(.*)/;
1046 my $dc_element = $1;
1047
1048 if (!defined $explicit_dc->{$dc_element}) {
1049 $explicit_dc->{$dc_element} = [];
1050 }
1051 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1052
1053 if (defined $version && ($version eq "oai_dc")) {
1054 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1055 }
1056 else {
1057 # qualifier???
1058 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1059 }
1060
1061 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1062 $data->[0] =~ m/^ex\.dc\.(.*)/;
1063 my $ex_dc_element = $1;
1064 my $lc_ex_dc_element = lc($ex_dc_element);
1065
1066 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1067 if (defined $dc_set->{$ex_dc_element}) {
1068 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1069 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1070 }
1071 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1072 }
1073 }
1074 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1075 $data->[0] =~ m/^(ex\.)?(.*)/;
1076 my $ex_element = $2;
1077 my $lc_ex_element = lc($ex_element);
1078
1079 if (defined $dc_set->{$ex_element}) {
1080 if (!defined $explicit_ex->{$lc_ex_element}) {
1081 $explicit_ex->{$lc_ex_element} = [];
1082 }
1083 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1084 }
1085 }
1086 }
1087
1088 # go through dc_set and for any element *not* defined in explicit_dc
1089 # that does exist in explicit_ex, add it in as metadata
1090 foreach my $k ( keys %$dc_set ) {
1091 my $lc_k = lc($k);
1092
1093 if (!defined $explicit_dc->{$lc_k}) {
1094 # try to find if ex.dc.* defines this dc.* meta,
1095 # if not, then look for whether there's an ex.* equivalent
1096
1097 if (defined $explicit_ex_dc->{$lc_k}) {
1098 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1099 my $dc_element = $lc_k;
1100 my $escaped_value = $v;
1101
1102 if (defined $version && ($version eq "oai_dc")) {
1103 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1104 }
1105 else {
1106 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1107 }
1108 }
1109 } elsif (defined $explicit_ex->{$lc_k}) {
1110 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1111 my $dc_element = $lc_k;
1112 my $escaped_value = $v;
1113
1114 if (defined $version && ($version eq "oai_dc")) {
1115 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1116 }
1117 else {
1118 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1119 }
1120 }
1121 }
1122 }
1123 }
1124
1125 if ($all_text eq "") {
1126 $all_text .= " There is no Dublin Core metatdata in this document\n";
1127 }
1128 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1129
1130 return $all_text;
1131}
1132
1133# Build up dublin_core metadata. Priority given to dc.* over ex.*
1134# This method was apparently added by Jeffrey and committed by Shaoqun.
1135# But we don't know why it was added, so not using it anymore.
1136sub new_get_dc_metadata {
1137
1138 my $self = shift(@_);
1139 my ($doc_obj, $section, $version) = @_;
1140
1141 # build up string of dublin core metadata
1142 $section="" unless defined $section;
1143
1144 my $section_ptr=$doc_obj->_lookup_section($section);
1145 return "" unless defined $section_ptr;
1146
1147 my $all_text = "";
1148 foreach my $data (@{$section_ptr->{'metadata'}}){
1149 my $escaped_value = &docprint::escape_text($data->[1]);
1150 my $dc_element = $data->[0];
1151
1152 my @array = split('\.',$dc_element);
1153 my ($type,$name);
1154
1155 if(defined $array[1])
1156 {
1157 $type = $array[0];
1158 $name = $array[1];
1159 }
1160 else
1161 {
1162 $type = "ex";
1163 $name = $array[0];
1164 }
1165
1166 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1167 }
1168 return $all_text;
1169}
1170
1171
11721;
Note: See TracBrowser for help on using the repository browser.