source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27499

Last change on this file since 27499 was 27499, checked in by jmt12, 11 years ago

New configuration options to control the creation of directories in the Archives directory... controlling the length of each split, and whether the string HASH should count against that limit

  • Property svn:keywords set to Author Date Id Revision
File size: 32.9 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "no",
96 'hiddengli' => "no"}
97];
98
99my $options = { 'name' => "BasePlugout",
100 'desc' => "{BasPlugout.desc}",
101 'abstract' => "yes",
102 'inherits' => "no",
103 'args' => $arguments};
104
105sub new
106{
107 my $class = shift (@_);
108
109 my ($plugoutlist,$args,$hashArgOptLists) = @_;
110 push(@$plugoutlist, $class);
111
112 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
113
114 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115 push(@{$hashArgOptLists->{"OptList"}},$options);
116
117 my $self = {};
118 $self->{'plugout_type'} = $class;
119 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120 $self->{"info_only"} = 0;
121
122 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123 # the args, just return the object.
124 foreach my $strArg (@{$args})
125 {
126 if(defined $strArg && $strArg eq "-gsdlinfo")
127 {
128 $self->{"info_only"} = 1;
129 return bless $self, $class;
130 }
131 }
132
133 delete $self->{"info_only"};
134
135 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
136 {
137 my $classTempClass = bless $self, $class;
138 print STDERR "<BadPlugout d=$plugout_name>\n";
139 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
140 $classTempClass->print_txt_usage(""); # Use default resource bundle
141 die "\n";
142 }
143
144
145 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
146 {
147 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148 if (!defined $full_file_path) {
149 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150 die "\n";
151 }
152 $self->{'xslt_file'} = $full_file_path;
153 }
154
155 $self->{'gs_count'} = 0;
156
157 $self->{'keep_import_structure'} = 0;
158
159 return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165 my $self = shift(@_);
166 my $header = shift(@_);
167 my $high_level_information_only = shift(@_);
168
169 # XML output is always in UTF-8
170 gsprintf::output_strings_in_UTF8;
171
172 if ($header) {
173 &PrintUsage::print_xml_header("plugout");
174 }
175 $self->print_xml($high_level_information_only);
176}
177
178
179sub print_xml
180{
181 my $self = shift(@_);
182 my $high_level_information_only = shift(@_);
183
184 my $optionlistref = $self->{'option_list'};
185 my @optionlist = @$optionlistref;
186 my $plugoutoptions = shift(@$optionlistref);
187 return if (!defined($plugoutoptions));
188
189 gsprintf(STDERR, "<PlugoutInfo>\n");
190 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
191 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192 $desc =~ s/</&amp;lt;/g; # doubly escaped
193 $desc =~ s/>/&amp;gt;/g;
194 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
195 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
197 unless (defined($high_level_information_only)) {
198 gsprintf(STDERR, " <Arguments>\n");
199 if (defined($plugoutoptions->{'args'})) {
200 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201 }
202 gsprintf(STDERR, " </Arguments>\n");
203
204 # Recurse up the plugout hierarchy
205 $self->print_xml();
206 }
207 gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213 my $self = shift(@_);
214
215 # Print the usage message for a plugout (recursively)
216 my $descoffset = $self->determine_description_offset(0);
217 $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222 my $self = shift(@_);
223 my $maxoffset = shift(@_);
224
225 my $optionlistref = $self->{'option_list'};
226 my @optionlist = @$optionlistref;
227 my $plugoutoptions = pop(@$optionlistref);
228 return $maxoffset if (!defined($plugoutoptions));
229
230 # Find the length of the longest option string of this download
231 my $plugoutargs = $plugoutoptions->{'args'};
232 if (defined($plugoutargs)) {
233 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234 if ($longest > $maxoffset) {
235 $maxoffset = $longest;
236 }
237 }
238
239 # Recurse up the download hierarchy
240 $maxoffset = $self->determine_description_offset($maxoffset);
241 $self->{'option_list'} = \@optionlist;
242 return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248 my $self = shift(@_);
249 my $descoffset = shift(@_);
250 my $isleafclass = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $plugoutoptions = shift(@$optionlistref);
255 return if (!defined($plugoutoptions));
256
257 my $plugoutname = $plugoutoptions->{'name'};
258 my $plugoutargs = $plugoutoptions->{'args'};
259 my $plugoutdesc = $plugoutoptions->{'desc'};
260
261 # Produce the usage information using the data structure above
262 if ($isleafclass) {
263 if (defined($plugoutdesc)) {
264 gsprintf(STDERR, "$plugoutdesc\n\n");
265 }
266 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267 }
268
269 # Display the download options, if there are some
270 if (defined($plugoutargs)) {
271 # Calculate the column offset of the option descriptions
272 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
273
274 if ($isleafclass) {
275 gsprintf(STDERR, " {common.specific_options}:\n");
276 }
277 else {
278 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279 }
280
281 # Display the download options
282 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283 }
284
285 # Recurse up the download hierarchy
286 $self->print_plugout_usage($descoffset, 0);
287 $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293 my ($strFunctionName,$strError) = @_;
294 {
295 print "Error occoured in BasePlugout.pm\n".
296 "In Function: ".$strFunctionName."\n".
297 "Error Message: ".$strError."\n";
298 exit(-1);
299 }
300}
301
302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
303sub set_OIDtype {
304 my $self = shift (@_);
305 my ($type, $metadata) = @_;
306
307 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
308 $self->{'OIDtype'} = $type;
309 } else {
310 $self->{'OIDtype'} = "hash";
311 }
312 if ($type =~ /^assigned$/) {
313 if (defined $metadata) {
314 $self->{'OIDmetadata'} = $metadata;
315 } else {
316 $self->{'OIDmetadata'} = "dc.Identifier";
317 }
318 }
319}
320
321sub set_output_dir
322{
323 my $self = shift @_;
324 my ($output_dir) = @_;
325
326 $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331 my $self = shift @_;
332 my ($output_dir) = @_;
333
334 $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339 my $self = shift (@_);
340
341 return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346 my $self = shift (@_);
347
348 return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353 my $self = shift (@_);
354
355 return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361 my $self = shift (@_);
362
363 my ($output_file_name) = @_;
364
365 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
366
367 return *OUTPUT;
368}
369
370sub release_output_handler
371{
372 my $self = shift (@_);
373 my ($outhandler) = @_;
374
375 close($outhandler);
376
377}
378
379sub output_xml_header {
380 my $self = shift (@_);
381 my ($handle,$docroot,$nondoctype) = @_;
382
383
384 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
385
386 #For Dspace must be UTF in lower case
387 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
388
389 if (!defined $nondoctype){
390 my $doctype = (defined $docroot) ? $docroot : "Section";
391
392 # Used to be '<!DOCTYPE Archive SYSTEM ...'
393
394 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
395 }
396
397 print $handle "<$docroot>\n" if defined $docroot;
398}
399
400sub output_xml_footer {
401 my $self = shift (@_);
402 my ($handle,$docroot) = @_;
403 print $handle "</$docroot>\n" if defined $docroot;
404}
405
406
407sub output_general_xml_header
408{
409 my $self = shift (@_);
410 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
411
412 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
413
414 if (defined $opt_dtd) {
415 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
416 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
417 }
418
419 if (defined $docroot) {
420 my $full_docroot = $docroot;
421 if (defined $opt_attributes) {
422 $full_docroot .= " $opt_attributes";
423 }
424
425 print $handle "<$full_docroot>\n"
426 }
427}
428
429sub output_general_xml_footer
430{
431 output_xml_footer(@_);
432}
433
434
435sub process {
436 my $self = shift (@_);
437 my ($doc_obj) = @_;
438
439 # for OAI purposes
440 $doc_obj->set_lastmodified();
441 $doc_obj->set_oailastmodified();
442
443 if ($self->{'group_size'} > 1) {
444 $self->group_process ($doc_obj);
445 return;
446 }
447
448 my $OID = $doc_obj->get_OID();
449 $OID = "NULL" unless defined $OID;
450
451 my $top_section = $doc_obj->get_top_section();
452
453 #get document's directory
454 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
455
456 my $output_info = $self->{'output_info'};
457 return if (!defined $output_info);
458
459 ##############################
460 # call subclass' saveas method
461 ##############################
462 $self->saveas($doc_obj,$doc_dir);
463 $self->archiveinf_db($doc_obj,$doc_dir);
464
465}
466
467sub store_output_info_reference {
468 my $self = shift (@_);
469 my ($doc_obj) = @_;
470
471 my $output_info = $self->{'output_info'};
472 my $metaname = $self->{'sortmeta'};
473 if (!defined $metaname || $metaname !~ /\S/) {
474 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
475 return;
476 }
477
478 my $metadata = "";
479 my $top_section = $doc_obj->get_top_section();
480
481 my @commameta_list = split(/,/, $metaname);
482 foreach my $cmn (@commameta_list) {
483 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
484 if ($meta) {
485 # do remove prefix/suffix - this will apply to all values
486 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
487 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
488 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
489 $metadata .= $meta if ($meta);
490 }
491 }
492
493 # store reference in the output_info
494 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
495
496}
497
498sub group_process {
499
500 my $self = shift (@_);
501 my ($doc_obj) = @_;
502
503 my $OID = $doc_obj->get_OID();
504 $OID = "NULL" unless defined $OID;
505
506 my $groupsize = $self->{'group_size'};
507 my $gs_count = $self->{'gs_count'};
508 my $open_new_file = (($gs_count % $groupsize)==0);
509 my $outhandle = $self->{'output_handle'};
510
511 # opening a new file, or document has assoicated files => directory needed
512 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
513
514 # The directory the archive file (doc.xml) and all associated files
515 # should end up in
516 my $doc_dir;
517 # If we've determined its time for a new file, open it now
518 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
519 {
520 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
521 # only if opening new file
522 my $output_dir = $self->get_output_dir();
523 &util::mk_all_dir ($output_dir) unless -e $output_dir;
524 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
525 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
526
527 if ($gs_count>0)
528 {
529 return if (!$self->close_file_output());
530 }
531
532 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
533
534
535 $self->{'gs_filename'} = $doc_file;
536 $self->{'short_doc_file'} = $short_doc_file;
537 $self->{'gs_OID'} = $OID;
538 $self->{'gs_doc_dir'} = $doc_dir;
539
540 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
541 }
542 # Otherwise load the same archive document directory used last time
543 else
544 {
545 $doc_dir = $self->{'gs_doc_dir'};
546 }
547
548 # copy all the associated files, add this information as metadata
549 # to the document
550 print $outhandle "Writing associated files to $doc_dir\n";
551 $self->process_assoc_files ($doc_obj, $doc_dir);
552
553 # look up 'gsdlmetafile' metadata and store that information
554 # explicitly in $doc_obj
555 $self->process_metafiles_metadata ($doc_obj);
556 }
557
558 # save this document
559 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
560 print GROUPPROCESS $section_text;
561
562 $self->{'gs_count'}++;
563}
564
565
566sub saveas {
567 my $self = shift (@_);
568
569 die "Basplug::saveas function must be implemented in sub classes\n";
570}
571
572sub get_doc_dir {
573 my $self = shift (@_);
574 my ($OID, $source_filename) = @_;
575
576 my $working_dir = $self->get_output_dir();
577 my $working_info = $self->{'output_info'};
578 return if (!defined $working_info);
579
580 my $doc_info = $working_info->get_info($OID);
581 my $doc_dir = '';
582
583 if (defined $doc_info && scalar(@$doc_info) >= 1)
584 {
585 # This OID already has an archives directory, so use it again
586 $doc_dir = $doc_info->[0];
587 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
588 }
589 elsif ($self->{'keep_import_structure'})
590 {
591 $source_filename = &File::Basename::dirname($source_filename);
592 $source_filename =~ s/[\\\/]+/\//g;
593 $source_filename =~ s/\/$//;
594
595 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
596 }
597
598 # We have to use a new archives directory for this document
599 if ($doc_dir eq "")
600 {
601 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
602 }
603
604 if (!defined $self->{'group'} || !$self->{'group'}){
605 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
606 }
607
608 return $doc_dir;
609}
610
611sub get_new_doc_dir{
612 my $self = shift (@_);
613 my($working_info,$working_dir,$OID) = @_;
614
615
616 my $doc_dir = "";
617 my $doc_dir_rest = $OID;
618
619 # remove any \ and / from the OID
620 $doc_dir_rest =~ s/[\\\/]//g;
621
622 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
623 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
624
625 my $doc_dir_num = 0;
626
627 do {
628 $doc_dir .= "/" if $doc_dir_num > 0;
629 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
630 if ($self->{'subdir_hash_prefix'})
631 {
632 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
633 }
634 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
635 if ($doc_dir_rest =~ s/$pattern//i)
636 {
637 $doc_dir .= $1;
638 $doc_dir_num++;
639 }
640 } while ($doc_dir_rest ne "" &&
641 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
642 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
643 my $i = 1;
644 my $doc_dir_base = $doc_dir;
645 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
646 $doc_dir = "$doc_dir_base-$i";
647 $i++;
648 }
649
650 return "$doc_dir.dir";
651}
652
653sub process_assoc_files {
654 my $self = shift (@_);
655 my ($doc_obj, $doc_dir, $handle) = @_;
656
657 my $outhandle = $self->{'output_handle'};
658
659 my $output_dir = $self->get_output_dir();
660 return if (!defined $output_dir);
661
662 &util::mk_all_dir ($output_dir) unless -e $output_dir;
663
664 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
665 &util::mk_all_dir ($working_dir) unless -e $working_dir;
666
667 my @assoc_files = ();
668 my $filename;;
669
670 my $source_filename = $doc_obj->get_source_filename();
671
672 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
673
674 if (defined $collect_dir) {
675 my $dirsep_regexp = &util::get_os_dirsep();
676
677 if ($collect_dir !~ /$dirsep_regexp$/) {
678 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
679 }
680
681 # This test is never going to fail on Windows -- is this a problem?
682
683 if ($source_filename !~ /^$dirsep_regexp/) {
684 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
685 }
686 }
687
688
689 # set the assocfile path (even if we have no assoc files - need this for lucene)
690 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
691 "assocfilepath",
692 "$doc_dir");
693 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
694 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
695 $dir = "" unless defined $dir;
696
697 my $utf8_real_filename = $assoc_file_rec->[0];
698
699 # for some reasons the image associate file has / before the full path
700 $utf8_real_filename =~ s/^\\(.*)/$1/i;
701
702## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
703 my $real_filename = $utf8_real_filename;
704 $real_filename = &util::downgrade_if_dos_filename($real_filename);
705
706 if (-e $real_filename) {
707
708 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
709
710 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
711
712 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
713 "gsdlassocfile",
714 "$afile:$assoc_file_rec->[2]:$dir");
715 } elsif ($self->{'verbosity'} > 1) {
716 print $outhandle "BasePlugout::process couldn't copy the associated file " .
717 "$real_filename to $afile\n";
718 }
719 }
720}
721
722
723sub process_metafiles_metadata
724{
725 my $self = shift (@_);
726 my ($doc_obj) = @_;
727
728 my $top_section = $doc_obj->get_top_section();
729 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
730
731 foreach my $metafile_pair (@$metafiles) {
732 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
733
734 $doc_obj->metadata_file($full_metafile,$metafile);
735 }
736
737 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
738}
739
740sub archiveinf_files_to_field
741{
742 my $self = shift(@_);
743 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
744
745 foreach my $file_rec (@$files) {
746 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
747 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
748 # for some reasons the image associate file has / before the full path
749 $real_filename =~ s/^\\(.*)/$1/i;
750
751 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
752
753 if (-e $raw_filename) {
754
755# if (defined $collect_dir) {
756# my $collect_dir_re_safe = $collect_dir;
757# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
758# $collect_dir_re_safe =~ s/\./\\./g;##
759
760# $real_filename =~ s/^$collect_dir_re_safe//;
761# }
762
763 if (defined $reverse_lookups) {
764 $reverse_lookups->{$real_filename} = 1;
765 }
766### push(@{$oid_files->{$field}},$full_file);
767 push(@{$oid_files->{$field}},$raw_filename);
768 }
769 else {
770 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
771 }
772 }
773}
774
775sub archiveinf_db
776{
777 my $self = shift (@_);
778 my ($doc_obj) = @_;
779
780 my $verbosity = $self->{'verbosity'};
781
782 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
783 if (defined $collect_dir) {
784 my $dirsep_regexp = &util::get_os_dirsep();
785
786 if ($collect_dir !~ /$dirsep_regexp$/) {
787 # ensure there is a slash at the end
788 $collect_dir .= &util::get_dirsep();
789 }
790 }
791
792 my $oid = $doc_obj->get_OID();
793 my $source_filename = $doc_obj->get_unmodified_source_filename();
794 my $working_info = $self->{'output_info'};
795 my $doc_info = $working_info->get_info($oid);
796
797 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
798 # doc_file is the path to the archive doc.xml. Make sure it has unix
799 # slashes, then if the collection is copied to linux, it can be built without reimport
800 $doc_file =~ s/\\/\//g;
801 my $oid_files = { 'doc-file' => $doc_file,
802 'index-status' => $index_status,
803 'src-file' => $source_filename,
804 'sort-meta' => $sortmeta,
805 'assoc-file' => [],
806 'meta-file' => [] };
807
808 my $reverse_lookups = { $source_filename => "1" };
809
810
811 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
812 $collect_dir,$oid_files,$reverse_lookups);
813
814
815 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
816 $collect_dir,$oid_files);
817
818 # Get the infodbtype value for this collection from the arcinfo object
819 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
820 my $output_dir = $self->{'output_dir'};
821
822 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
823 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
824
825 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
826
827 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
828 my $top_section = $doc_obj->get_top_section();
829
830 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
831 my $dc_title = join("; ", @$dc_titles);
832
833 if ($oid_files->{'index-status'} eq "R") {
834 $dc_title .= " (Updated)";
835 }
836
837 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
838 if (open(RSSOUT,">>$rss_filename")) {
839 print RSSOUT "<item>\n";
840 print RSSOUT " <title>$dc_title</title>\n";
841 print RSSOUT " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
842 print RSSOUT "</item>\n";
843 close(RSSOUT);
844 }
845 else {
846 print STDERR "**** Failed to open $rss_filename\n!$\n";
847 }
848
849
850 }
851
852 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
853 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
854 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
855 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
856
857 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
858 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
859 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
860
861 foreach my $rl (keys %$reverse_lookups) {
862 $working_info->add_reverseinfo($rl,$oid);
863 }
864
865 # meta files not set in reverese entry, but need to set the metadata flag
866 if (defined $doc_obj->get_meta_files()) {
867 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
868 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
869 $working_info->set_meta_file_flag($full_file);
870 }
871 }
872}
873
874
875sub set_sortmeta {
876 my $self = shift (@_);
877 my ($sortmeta, $removeprefix, $removesuffix) = @_;
878
879 $self->{'sortmeta'} = $sortmeta;
880 if (defined ($removeprefix) && $removeprefix ) {
881 $removeprefix =~ s/^\^//; # don't need a leading ^
882 $self->{'removeprefix'} = $removeprefix;
883 }
884 if (defined ($removesuffix) && $removesuffix) {
885 $removesuffix =~ s/\$$//; # don't need a trailing $
886 $self->{'removesuffix'} = $removesuffix;
887 }
888}
889
890sub open_xslt_pipe
891{
892 my $self = shift @_;
893 my ($output_file_name, $xslt_file)=@_;
894
895 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
896
897 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
898
899 my $mapping_file_path = "";
900
901 if ($ENV{'GSDLOS'} eq "windows"){
902 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
903 # this file:/// bit didn't work for me on windows XP
904 #$xslt_file = "\"file:///".$xslt_file."\"";
905 #$mapping_file_path = "\"file:///";
906 }
907 else{
908 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
909 }
910
911
912 $java_class_path = "\"".$java_class_path."\"";
913
914 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
915
916 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
917 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
918 $cmd .= "-m $mapping_file_path";
919 }
920
921 open(*XMLWRITER, $cmd)
922 or die "can't open pipe to xslt: $!";
923
924
925 $self->{'xslt_writer'} = *XMLWRITER;
926
927 print XMLWRITER "<?DocStart?>\n";
928 print XMLWRITER "$output_file_name\n";
929
930
931 }
932
933
934sub close_xslt_pipe
935{
936 my $self = shift @_;
937
938
939 return unless defined $self->{'xslt_writer'} ;
940
941 my $xsltwriter = $self->{'xslt_writer'};
942
943 print $xsltwriter "<?DocEnd?>\n";
944 close($xsltwriter);
945
946 undef $self->{'xslt_writer'};
947
948}
949
950sub close_file_output
951{
952 my ($self) = @_;
953
954 # make sure that the handle has been opened - it won't be if we failed
955 # to import any documents...
956 if (defined(fileno(GROUPPROCESS))) {
957 $self->output_xml_footer('GROUPPROCESS','Archive');
958 close GROUPPROCESS;
959 }
960
961 my $OID = $self->{'gs_OID'};
962 my $short_doc_file = $self->{'short_doc_file'};
963
964 if ($self->{'gzip'}) {
965 my $doc_file = $self->{'gs_filename'};
966 `gzip $doc_file`;
967 $doc_file .= ".gz";
968 $short_doc_file .= ".gz";
969 if (!-e $doc_file) {
970 my $outhandle = $self->{'output_handle'};
971 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
972 return 0;
973 }
974 }
975
976 # store reference in output_info
977 my $output_info = $self->{'output_info'};
978 return 0 if (!defined $output_info);
979 $output_info->add_info($OID, $short_doc_file, undef, undef);
980 return 1;
981}
982
983
984#the subclass should implement this method if is_group method could return 1.
985sub close_group_output{
986 my $self = shift (@_);
987}
988
989sub is_group {
990 my $self = shift (@_);
991 return 0;
992}
993
994my $dc_set = { Title => 1,
995 Creator => 1,
996 Subject => 1,
997 Description => 1,
998 Publisher => 1,
999 Contributor => 1,
1000 Date => 1,
1001 Type => 1,
1002 Format => 1,
1003 Identifier => 1,
1004 Source => 1,
1005 Language => 1,
1006 Relation => 1,
1007 Coverage => 1,
1008 Rights => 1};
1009
1010
1011# returns an XML representation of the dublin core metadata
1012# if dc meta is not found, try ex meta
1013# This method is not used by the DSpacePlugout, which has its
1014# own method to save its dc metadata
1015sub get_dc_metadata {
1016 my $self = shift(@_);
1017 my ($doc_obj, $section, $version) = @_;
1018
1019 # build up string of dublin core metadata
1020 $section="" unless defined $section;
1021
1022 my $section_ptr = $doc_obj->_lookup_section($section);
1023 return "" unless defined $section_ptr;
1024
1025
1026 my $explicit_dc = {};
1027 my $explicit_ex_dc = {};
1028 my $explicit_ex = {};
1029
1030 my $all_text="";
1031
1032 # We want high quality dc metadata to go in first, so we store all the
1033 # assigned dc.* values first. Then, for all those dc metadata names in
1034 # the official dc set that are as yet unassigned, we look to see whether
1035 # embedded ex.dc.* metadata has defined some values for them. If not,
1036 # then for the same missing dc metadata names, we look in ex metadata.
1037
1038 foreach my $data (@{$section_ptr->{'metadata'}}){
1039 my $escaped_value = &docprint::escape_text($data->[1]);
1040 if ($data->[0]=~ m/^dc\./) {
1041 $data->[0] =~ tr/[A-Z]/[a-z]/;
1042
1043 $data->[0] =~ m/^dc\.(.*)/;
1044 my $dc_element = $1;
1045
1046 if (!defined $explicit_dc->{$dc_element}) {
1047 $explicit_dc->{$dc_element} = [];
1048 }
1049 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1050
1051 if (defined $version && ($version eq "oai_dc")) {
1052 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1053 }
1054 else {
1055 # qualifier???
1056 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1057 }
1058
1059 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1060 $data->[0] =~ m/^ex\.dc\.(.*)/;
1061 my $ex_dc_element = $1;
1062 my $lc_ex_dc_element = lc($ex_dc_element);
1063
1064 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1065 if (defined $dc_set->{$ex_dc_element}) {
1066 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1067 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1068 }
1069 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1070 }
1071 }
1072 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1073 $data->[0] =~ m/^(ex\.)?(.*)/;
1074 my $ex_element = $2;
1075 my $lc_ex_element = lc($ex_element);
1076
1077 if (defined $dc_set->{$ex_element}) {
1078 if (!defined $explicit_ex->{$lc_ex_element}) {
1079 $explicit_ex->{$lc_ex_element} = [];
1080 }
1081 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1082 }
1083 }
1084 }
1085
1086 # go through dc_set and for any element *not* defined in explicit_dc
1087 # that does exist in explicit_ex, add it in as metadata
1088 foreach my $k ( keys %$dc_set ) {
1089 my $lc_k = lc($k);
1090
1091 if (!defined $explicit_dc->{$lc_k}) {
1092 # try to find if ex.dc.* defines this dc.* meta,
1093 # if not, then look for whether there's an ex.* equivalent
1094
1095 if (defined $explicit_ex_dc->{$lc_k}) {
1096 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1097 my $dc_element = $lc_k;
1098 my $escaped_value = $v;
1099
1100 if (defined $version && ($version eq "oai_dc")) {
1101 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1102 }
1103 else {
1104 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1105 }
1106 }
1107 } elsif (defined $explicit_ex->{$lc_k}) {
1108 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1109 my $dc_element = $lc_k;
1110 my $escaped_value = $v;
1111
1112 if (defined $version && ($version eq "oai_dc")) {
1113 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1114 }
1115 else {
1116 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1117 }
1118 }
1119 }
1120 }
1121 }
1122
1123 if ($all_text eq "") {
1124 $all_text .= " There is no Dublin Core metatdata in this document\n";
1125 }
1126 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1127
1128 return $all_text;
1129}
1130
1131# Build up dublin_core metadata. Priority given to dc.* over ex.*
1132# This method was apparently added by Jeffrey and committed by Shaoqun.
1133# But we don't know why it was added, so not using it anymore.
1134sub new_get_dc_metadata {
1135
1136 my $self = shift(@_);
1137 my ($doc_obj, $section, $version) = @_;
1138
1139 # build up string of dublin core metadata
1140 $section="" unless defined $section;
1141
1142 my $section_ptr=$doc_obj->_lookup_section($section);
1143 return "" unless defined $section_ptr;
1144
1145 my $all_text = "";
1146 foreach my $data (@{$section_ptr->{'metadata'}}){
1147 my $escaped_value = &docprint::escape_text($data->[1]);
1148 my $dc_element = $data->[0];
1149
1150 my @array = split('\.',$dc_element);
1151 my ($type,$name);
1152
1153 if(defined $array[1])
1154 {
1155 $type = $array[0];
1156 $name = $array[1];
1157 }
1158 else
1159 {
1160 $type = "ex";
1161 $name = $array[0];
1162 }
1163
1164 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1165 }
1166 return $all_text;
1167}
1168
1169
11701;
Note: See TracBrowser for help on using the repository browser.