source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27071

Last change on this file since 27071 was 27071, checked in by ak19, 11 years ago

Demo collection uses dc.Title metadata now instead of dls.Title. So setting the code that generates the rss-items.rdf file to look for dc.Title metadata. Useful for collections in general, as many will be using dc metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.0 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'deft' => "",
61 'hiddengli' => "no"},
62 { 'name' => "output_handle",
63 'desc' => "{BasPlugout.output_handle}",
64 'type' => "string",
65 'deft' => 'STDERR',
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "verbosity",
69 'desc' => "{BasPlugout.verbosity}",
70 'type' => "int",
71 'deft' => "0",
72 'reqd' => "no",
73 'hiddengli' => "no"},
74 { 'name' => "gzip_output",
75 'desc' => "{BasPlugout.gzip_output}",
76 'type' => "flag",
77 'reqd' => "no",
78 'hiddengli' => "no"},
79 { 'name' => "debug",
80 'desc' => "{BasPlugout.debug}",
81 'type' => "flag",
82 'reqd' => "no",
83 'hiddengli' => "yes"}
84];
85
86my $options = { 'name' => "BasePlugout",
87 'desc' => "{BasPlugout.desc}",
88 'abstract' => "yes",
89 'inherits' => "no",
90 'args' => $arguments};
91
92sub new
93{
94 my $class = shift (@_);
95
96 my ($plugoutlist,$args,$hashArgOptLists) = @_;
97 push(@$plugoutlist, $class);
98
99 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102 push(@{$hashArgOptLists->{"OptList"}},$options);
103
104 my $self = {};
105 $self->{'plugout_type'} = $class;
106 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107 $self->{"info_only"} = 0;
108
109 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110 # the args, just return the object.
111 foreach my $strArg (@{$args})
112 {
113 if(defined $strArg && $strArg eq "-gsdlinfo")
114 {
115 $self->{"info_only"} = 1;
116 return bless $self, $class;
117 }
118 }
119
120 delete $self->{"info_only"};
121
122 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123 {
124 my $classTempClass = bless $self, $class;
125 print STDERR "<BadPlugout d=$plugout_name>\n";
126 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127 $classTempClass->print_txt_usage(""); # Use default resource bundle
128 die "\n";
129 }
130
131
132 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
133 {
134 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135 if (!defined $full_file_path) {
136 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137 die "\n";
138 }
139 $self->{'xslt_file'} = $full_file_path;
140 }
141
142 $self->{'gs_count'} = 0;
143
144 $self->{'keep_import_structure'} = 0;
145
146 return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152 my $self = shift(@_);
153 my $header = shift(@_);
154 my $high_level_information_only = shift(@_);
155
156 # XML output is always in UTF-8
157 gsprintf::output_strings_in_UTF8;
158
159 if ($header) {
160 &PrintUsage::print_xml_header("plugout");
161 }
162 $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168 my $self = shift(@_);
169 my $high_level_information_only = shift(@_);
170
171 my $optionlistref = $self->{'option_list'};
172 my @optionlist = @$optionlistref;
173 my $plugoutoptions = shift(@$optionlistref);
174 return if (!defined($plugoutoptions));
175
176 gsprintf(STDERR, "<PlugoutInfo>\n");
177 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
178 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179 $desc =~ s/</&amp;lt;/g; # doubly escaped
180 $desc =~ s/>/&amp;gt;/g;
181 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184 unless (defined($high_level_information_only)) {
185 gsprintf(STDERR, " <Arguments>\n");
186 if (defined($plugoutoptions->{'args'})) {
187 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188 }
189 gsprintf(STDERR, " </Arguments>\n");
190
191 # Recurse up the plugout hierarchy
192 $self->print_xml();
193 }
194 gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200 my $self = shift(@_);
201
202 # Print the usage message for a plugout (recursively)
203 my $descoffset = $self->determine_description_offset(0);
204 $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209 my $self = shift(@_);
210 my $maxoffset = shift(@_);
211
212 my $optionlistref = $self->{'option_list'};
213 my @optionlist = @$optionlistref;
214 my $plugoutoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($plugoutoptions));
216
217 # Find the length of the longest option string of this download
218 my $plugoutargs = $plugoutoptions->{'args'};
219 if (defined($plugoutargs)) {
220 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the download hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235 my $self = shift(@_);
236 my $descoffset = shift(@_);
237 my $isleafclass = shift(@_);
238
239 my $optionlistref = $self->{'option_list'};
240 my @optionlist = @$optionlistref;
241 my $plugoutoptions = shift(@$optionlistref);
242 return if (!defined($plugoutoptions));
243
244 my $plugoutname = $plugoutoptions->{'name'};
245 my $plugoutargs = $plugoutoptions->{'args'};
246 my $plugoutdesc = $plugoutoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugoutdesc)) {
251 gsprintf(STDERR, "$plugoutdesc\n\n");
252 }
253 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254 }
255
256 # Display the download options, if there are some
257 if (defined($plugoutargs)) {
258 # Calculate the column offset of the option descriptions
259 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266 }
267
268 # Display the download options
269 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270 }
271
272 # Recurse up the download hierarchy
273 $self->print_plugout_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280 my ($strFunctionName,$strError) = @_;
281 {
282 print "Error occoured in BasePlugout.pm\n".
283 "In Function: ".$strFunctionName."\n".
284 "Error Message: ".$strError."\n";
285 exit(-1);
286 }
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291 my $self = shift (@_);
292 my ($type, $metadata) = @_;
293
294 if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
295 $self->{'OIDtype'} = $type;
296 } else {
297 $self->{'OIDtype'} = "hash";
298 }
299 if ($type =~ /^assigned$/) {
300 if (defined $metadata) {
301 $self->{'OIDmetadata'} = $metadata;
302 } else {
303 $self->{'OIDmetadata'} = "dc.Identifier";
304 }
305 }
306}
307
308sub set_output_dir
309{
310 my $self = shift @_;
311 my ($output_dir) = @_;
312
313 $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318 my $self = shift @_;
319 my ($output_dir) = @_;
320
321 $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326 my $self = shift (@_);
327
328 return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333 my $self = shift (@_);
334
335 return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340 my $self = shift (@_);
341
342 return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348 my $self = shift (@_);
349
350 my ($output_file_name) = @_;
351
352 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
353
354 return *OUTPUT;
355}
356
357sub release_output_handler
358{
359 my $self = shift (@_);
360 my ($outhandler) = @_;
361
362 close($outhandler);
363
364}
365
366sub output_xml_header {
367 my $self = shift (@_);
368 my ($handle,$docroot,$nondoctype) = @_;
369
370
371 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372
373 #For Dspace must be UTF in lower case
374 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
375
376 if (!defined $nondoctype){
377 my $doctype = (defined $docroot) ? $docroot : "Section";
378
379 # Used to be '<!DOCTYPE Archive SYSTEM ...'
380
381 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
382 }
383
384 print $handle "<$docroot>\n" if defined $docroot;
385}
386
387sub output_xml_footer {
388 my $self = shift (@_);
389 my ($handle,$docroot) = @_;
390 print $handle "</$docroot>\n" if defined $docroot;
391}
392
393
394sub output_general_xml_header
395{
396 my $self = shift (@_);
397 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
398
399 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
400
401 if (defined $opt_dtd) {
402 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
403 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
404 }
405
406 if (defined $docroot) {
407 my $full_docroot = $docroot;
408 if (defined $opt_attributes) {
409 $full_docroot .= " $opt_attributes";
410 }
411
412 print $handle "<$full_docroot>\n"
413 }
414}
415
416sub output_general_xml_footer
417{
418 output_xml_footer(@_);
419}
420
421
422sub process {
423 my $self = shift (@_);
424 my ($doc_obj) = @_;
425
426 # for OAI purposes
427 $doc_obj->set_lastmodified();
428 $doc_obj->set_oailastmodified();
429
430 if ($self->{'group_size'} > 1) {
431 $self->group_process ($doc_obj);
432 return;
433 }
434
435 my $OID = $doc_obj->get_OID();
436 $OID = "NULL" unless defined $OID;
437
438 my $top_section = $doc_obj->get_top_section();
439
440 #get document's directory
441 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
442
443 my $output_info = $self->{'output_info'};
444 return if (!defined $output_info);
445
446 ##############################
447 # call subclass' saveas method
448 ##############################
449 $self->saveas($doc_obj,$doc_dir);
450 $self->archiveinf_db($doc_obj,$doc_dir);
451
452}
453
454sub store_output_info_reference {
455 my $self = shift (@_);
456 my ($doc_obj) = @_;
457
458 my $output_info = $self->{'output_info'};
459 my $metaname = $self->{'sortmeta'};
460 if (!defined $metaname || $metaname !~ /\S/) {
461 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
462 return;
463 }
464
465 my $metadata = "";
466 my $top_section = $doc_obj->get_top_section();
467
468 my @commameta_list = split(/,/, $metaname);
469 foreach my $cmn (@commameta_list) {
470 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
471 if ($meta) {
472 # do remove prefix/suffix - this will apply to all values
473 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
474 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
475 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
476 $metadata .= $meta if ($meta);
477 }
478 }
479
480 # store reference in the output_info
481 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
482
483}
484
485sub group_process {
486
487 my $self = shift (@_);
488 my ($doc_obj) = @_;
489
490 my $OID = $doc_obj->get_OID();
491 $OID = "NULL" unless defined $OID;
492
493 my $groupsize = $self->{'group_size'};
494 my $gs_count = $self->{'gs_count'};
495 my $open_new_file = (($gs_count % $groupsize)==0);
496 my $outhandle = $self->{'output_handle'};
497
498 # opening a new file, or document has assoicated files => directory needed
499 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
500
501 # The directory the archive file (doc.xml) and all associated files
502 # should end up in
503 my $doc_dir;
504 # If we've determined its time for a new file, open it now
505 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
506 {
507 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
508 # only if opening new file
509 my $output_dir = $self->get_output_dir();
510 &util::mk_all_dir ($output_dir) unless -e $output_dir;
511 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
512 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
513
514 if ($gs_count>0)
515 {
516 return if (!$self->close_file_output());
517 }
518
519 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
520
521
522 $self->{'gs_filename'} = $doc_file;
523 $self->{'short_doc_file'} = $short_doc_file;
524 $self->{'gs_OID'} = $OID;
525 $self->{'gs_doc_dir'} = $doc_dir;
526
527 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
528 }
529 # Otherwise load the same archive document directory used last time
530 else
531 {
532 $doc_dir = $self->{'gs_doc_dir'};
533 }
534
535 # copy all the associated files, add this information as metadata
536 # to the document
537 print $outhandle "Writing associated files to $doc_dir\n";
538 $self->process_assoc_files ($doc_obj, $doc_dir);
539
540 # look up 'gsdlmetafile' metadata and store that information
541 # explicitly in $doc_obj
542 $self->process_metafiles_metadata ($doc_obj);
543 }
544
545 # save this document
546 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
547 print GROUPPROCESS $section_text;
548
549 $self->{'gs_count'}++;
550}
551
552
553sub saveas {
554 my $self = shift (@_);
555
556 die "Basplug::saveas function must be implemented in sub classes\n";
557}
558
559sub get_doc_dir {
560 my $self = shift (@_);
561 my ($OID, $source_filename) = @_;
562
563 my $working_dir = $self->get_output_dir();
564 my $working_info = $self->{'output_info'};
565 return if (!defined $working_info);
566
567 my $doc_info = $working_info->get_info($OID);
568 my $doc_dir = '';
569
570 if (defined $doc_info && scalar(@$doc_info) >= 1)
571 {
572 # This OID already has an archives directory, so use it again
573 $doc_dir = $doc_info->[0];
574 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
575 }
576 elsif ($self->{'keep_import_structure'})
577 {
578 $source_filename = &File::Basename::dirname($source_filename);
579 $source_filename =~ s/[\\\/]+/\//g;
580 $source_filename =~ s/\/$//;
581
582 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
583 }
584
585 # We have to use a new archives directory for this document
586 if ($doc_dir eq "")
587 {
588 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
589 }
590
591 if (!defined $self->{'group'} || !$self->{'group'}){
592 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
593 }
594
595 return $doc_dir;
596}
597
598sub get_new_doc_dir{
599 my $self = shift (@_);
600 my($working_info,$working_dir,$OID) = @_;
601
602
603 my $doc_dir = "";
604 my $doc_dir_rest = $OID;
605
606 # remove any \ and / from the OID
607 $doc_dir_rest =~ s/[\\\/]//g;
608
609 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
610 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
611
612 my $doc_dir_num = 0;
613
614 do {
615 $doc_dir .= "/" if $doc_dir_num > 0;
616 if ($doc_dir_rest =~ s/^(.{1,8})//) {
617 $doc_dir .= $1;
618 $doc_dir_num++;
619 }
620 } while ($doc_dir_rest ne "" &&
621 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
622 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
623 my $i = 1;
624 my $doc_dir_base = $doc_dir;
625 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
626 $doc_dir = "$doc_dir_base-$i";
627 $i++;
628 }
629
630 return "$doc_dir.dir";
631}
632
633sub process_assoc_files {
634 my $self = shift (@_);
635 my ($doc_obj, $doc_dir, $handle) = @_;
636
637 my $outhandle = $self->{'output_handle'};
638
639 my $output_dir = $self->get_output_dir();
640 return if (!defined $output_dir);
641
642 &util::mk_all_dir ($output_dir) unless -e $output_dir;
643
644 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
645 &util::mk_all_dir ($working_dir) unless -e $working_dir;
646
647 my @assoc_files = ();
648 my $filename;;
649
650 my $source_filename = $doc_obj->get_source_filename();
651
652 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
653
654 if (defined $collect_dir) {
655 my $dirsep_regexp = &util::get_os_dirsep();
656
657 if ($collect_dir !~ /$dirsep_regexp$/) {
658 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
659 }
660
661 # This test is never going to fail on Windows -- is this a problem?
662
663 if ($source_filename !~ /^$dirsep_regexp/) {
664 $source_filename = &util::filename_cat($collect_dir, $source_filename);
665 }
666 }
667
668
669 # set the assocfile path (even if we have no assoc files - need this for lucene)
670 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
671 "assocfilepath",
672 "$doc_dir");
673 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
674 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
675 $dir = "" unless defined $dir;
676
677 my $utf8_real_filename = $assoc_file_rec->[0];
678
679 # for some reasons the image associate file has / before the full path
680 $utf8_real_filename =~ s/^\\(.*)/$1/i;
681
682## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
683 my $real_filename = $utf8_real_filename;
684 $real_filename = &util::downgrade_if_dos_filename($real_filename);
685
686 if (-e $real_filename) {
687
688 $filename = &util::filename_cat($working_dir, $afile);
689
690 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
691
692 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
693 "gsdlassocfile",
694 "$afile:$assoc_file_rec->[2]:$dir");
695 } elsif ($self->{'verbosity'} > 1) {
696 print $outhandle "BasePlugout::process couldn't copy the associated file " .
697 "$real_filename to $afile\n";
698 }
699 }
700}
701
702
703sub process_metafiles_metadata
704{
705 my $self = shift (@_);
706 my ($doc_obj) = @_;
707
708 my $top_section = $doc_obj->get_top_section();
709 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
710
711 foreach my $metafile_pair (@$metafiles) {
712 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
713
714 $doc_obj->metadata_file($full_metafile,$metafile);
715 }
716
717 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
718}
719
720sub archiveinf_files_to_field
721{
722 my $self = shift(@_);
723 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
724
725 foreach my $file_rec (@$files) {
726 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
727 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
728 # for some reasons the image associate file has / before the full path
729 $real_filename =~ s/^\\(.*)/$1/i;
730
731 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
732
733 if (-e $raw_filename) {
734
735# if (defined $collect_dir) {
736# my $collect_dir_re_safe = $collect_dir;
737# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
738# $collect_dir_re_safe =~ s/\./\\./g;##
739
740# $real_filename =~ s/^$collect_dir_re_safe//;
741# }
742
743 if (defined $reverse_lookups) {
744 $reverse_lookups->{$real_filename} = 1;
745 }
746### push(@{$oid_files->{$field}},$full_file);
747 push(@{$oid_files->{$field}},$raw_filename);
748 }
749 else {
750 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
751 }
752 }
753}
754
755sub archiveinf_db
756{
757 my $self = shift (@_);
758 my ($doc_obj) = @_;
759
760 my $verbosity = $self->{'verbosity'};
761
762 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
763 if (defined $collect_dir) {
764 my $dirsep_regexp = &util::get_os_dirsep();
765
766 if ($collect_dir !~ /$dirsep_regexp$/) {
767 # ensure there is a slash at the end
768 $collect_dir .= &util::get_dirsep();
769 }
770 }
771
772 my $oid = $doc_obj->get_OID();
773 my $source_filename = $doc_obj->get_unmodified_source_filename();
774 my $working_info = $self->{'output_info'};
775 my $doc_info = $working_info->get_info($oid);
776
777 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
778 # doc_file is the path to the archive doc.xml. Make sure it has unix
779 # slashes, then if the collection is copied to linux, it can be built without reimport
780 $doc_file =~ s/\\/\//g;
781 my $oid_files = { 'doc-file' => $doc_file,
782 'index-status' => $index_status,
783 'src-file' => $source_filename,
784 'sort-meta' => $sortmeta,
785 'assoc-file' => [],
786 'meta-file' => [] };
787
788 my $reverse_lookups = { $source_filename => "1" };
789
790
791 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
792 $collect_dir,$oid_files,$reverse_lookups);
793
794
795 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
796 $collect_dir,$oid_files);
797
798 # Get the infodbtype value for this collection from the arcinfo object
799 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
800 my $output_dir = $self->{'output_dir'};
801
802 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
803 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
804
805 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
806
807 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
808 my $top_section = $doc_obj->get_top_section();
809
810 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
811 my $dc_title = join("; ", @$dc_titles);
812
813 if ($oid_files->{'index-status'} eq "R") {
814 $dc_title .= " (Updated)";
815 }
816
817 my $rss_filename = &util::filename_cat($output_dir,"rss-items.rdf");
818 if (open(RSSOUT,">>$rss_filename")) {
819 print RSSOUT "<item>\n";
820 print RSSOUT " <title>$dc_title</title>\n";
821 print RSSOUT " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
822 print RSSOUT "</item>\n";
823 close(RSSOUT);
824 }
825 else {
826 print STDERR "**** Failed to open $rss_filename\n!$\n";
827 }
828
829
830 }
831
832 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
833 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
834 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
835 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
836
837 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
838 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
839 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
840
841 foreach my $rl (keys %$reverse_lookups) {
842 $working_info->add_reverseinfo($rl,$oid);
843 }
844
845 # meta files not set in reverese entry, but need to set the metadata flag
846 if (defined $doc_obj->get_meta_files()) {
847 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
848 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
849 $working_info->set_meta_file_flag($full_file);
850 }
851 }
852}
853
854
855sub set_sortmeta {
856 my $self = shift (@_);
857 my ($sortmeta, $removeprefix, $removesuffix) = @_;
858
859 $self->{'sortmeta'} = $sortmeta;
860 if (defined ($removeprefix) && $removeprefix ) {
861 $removeprefix =~ s/^\^//; # don't need a leading ^
862 $self->{'removeprefix'} = $removeprefix;
863 }
864 if (defined ($removesuffix) && $removesuffix) {
865 $removesuffix =~ s/\$$//; # don't need a trailing $
866 $self->{'removesuffix'} = $removesuffix;
867 }
868}
869
870sub open_xslt_pipe
871{
872 my $self = shift @_;
873 my ($output_file_name, $xslt_file)=@_;
874
875 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
876
877 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
878
879 my $mapping_file_path = "";
880
881 if ($ENV{'GSDLOS'} eq "windows"){
882 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
883 # this file:/// bit didn't work for me on windows XP
884 #$xslt_file = "\"file:///".$xslt_file."\"";
885 #$mapping_file_path = "\"file:///";
886 }
887 else{
888 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
889 }
890
891
892 $java_class_path = "\"".$java_class_path."\"";
893
894 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
895
896 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
897 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
898 $cmd .= "-m $mapping_file_path";
899 }
900
901 open(*XMLWRITER, $cmd)
902 or die "can't open pipe to xslt: $!";
903
904
905 $self->{'xslt_writer'} = *XMLWRITER;
906
907 print XMLWRITER "<?DocStart?>\n";
908 print XMLWRITER "$output_file_name\n";
909
910
911 }
912
913
914sub close_xslt_pipe
915{
916 my $self = shift @_;
917
918
919 return unless defined $self->{'xslt_writer'} ;
920
921 my $xsltwriter = $self->{'xslt_writer'};
922
923 print $xsltwriter "<?DocEnd?>\n";
924 close($xsltwriter);
925
926 undef $self->{'xslt_writer'};
927
928}
929
930sub close_file_output
931{
932 my ($self) = @_;
933
934 # make sure that the handle has been opened - it won't be if we failed
935 # to import any documents...
936 if (defined(fileno(GROUPPROCESS))) {
937 $self->output_xml_footer('GROUPPROCESS','Archive');
938 close GROUPPROCESS;
939 }
940
941 my $OID = $self->{'gs_OID'};
942 my $short_doc_file = $self->{'short_doc_file'};
943
944 if ($self->{'gzip'}) {
945 my $doc_file = $self->{'gs_filename'};
946 `gzip $doc_file`;
947 $doc_file .= ".gz";
948 $short_doc_file .= ".gz";
949 if (!-e $doc_file) {
950 my $outhandle = $self->{'output_handle'};
951 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
952 return 0;
953 }
954 }
955
956 # store reference in output_info
957 my $output_info = $self->{'output_info'};
958 return 0 if (!defined $output_info);
959 $output_info->add_info($OID, $short_doc_file, undef, undef);
960 return 1;
961}
962
963
964#the subclass should implement this method if is_group method could return 1.
965sub close_group_output{
966 my $self = shift (@_);
967}
968
969sub is_group {
970 my $self = shift (@_);
971 return 0;
972}
973
974my $dc_set = { Title => 1,
975 Creator => 1,
976 Subject => 1,
977 Description => 1,
978 Publisher => 1,
979 Contributor => 1,
980 Date => 1,
981 Type => 1,
982 Format => 1,
983 Identifier => 1,
984 Source => 1,
985 Language => 1,
986 Relation => 1,
987 Coverage => 1,
988 Rights => 1};
989
990
991# returns an XML representation of the dublin core metadata
992# if dc meta is not found, try ex meta
993# This method is not used by the DSpacePlugout, which has its
994# own method to save its dc metadata
995sub get_dc_metadata {
996 my $self = shift(@_);
997 my ($doc_obj, $section, $version) = @_;
998
999 # build up string of dublin core metadata
1000 $section="" unless defined $section;
1001
1002 my $section_ptr = $doc_obj->_lookup_section($section);
1003 return "" unless defined $section_ptr;
1004
1005
1006 my $explicit_dc = {};
1007 my $explicit_ex_dc = {};
1008 my $explicit_ex = {};
1009
1010 my $all_text="";
1011
1012 # We want high quality dc metadata to go in first, so we store all the
1013 # assigned dc.* values first. Then, for all those dc metadata names in
1014 # the official dc set that are as yet unassigned, we look to see whether
1015 # embedded ex.dc.* metadata has defined some values for them. If not,
1016 # then for the same missing dc metadata names, we look in ex metadata.
1017
1018 foreach my $data (@{$section_ptr->{'metadata'}}){
1019 my $escaped_value = &docprint::escape_text($data->[1]);
1020 if ($data->[0]=~ m/^dc\./) {
1021 $data->[0] =~ tr/[A-Z]/[a-z]/;
1022
1023 $data->[0] =~ m/^dc\.(.*)/;
1024 my $dc_element = $1;
1025
1026 if (!defined $explicit_dc->{$dc_element}) {
1027 $explicit_dc->{$dc_element} = [];
1028 }
1029 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1030
1031 if (defined $version && ($version eq "oai_dc")) {
1032 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1033 }
1034 else {
1035 # qualifier???
1036 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1037 }
1038
1039 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1040 $data->[0] =~ m/^ex\.dc\.(.*)/;
1041 my $ex_dc_element = $1;
1042 my $lc_ex_dc_element = lc($ex_dc_element);
1043
1044 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1045 if (defined $dc_set->{$ex_dc_element}) {
1046 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1047 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1048 }
1049 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1050 }
1051 }
1052 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1053 $data->[0] =~ m/^(ex\.)?(.*)/;
1054 my $ex_element = $2;
1055 my $lc_ex_element = lc($ex_element);
1056
1057 if (defined $dc_set->{$ex_element}) {
1058 if (!defined $explicit_ex->{$lc_ex_element}) {
1059 $explicit_ex->{$lc_ex_element} = [];
1060 }
1061 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1062 }
1063 }
1064 }
1065
1066 # go through dc_set and for any element *not* defined in explicit_dc
1067 # that does exist in explicit_ex, add it in as metadata
1068 foreach my $k ( keys %$dc_set ) {
1069 my $lc_k = lc($k);
1070
1071 if (!defined $explicit_dc->{$lc_k}) {
1072 # try to find if ex.dc.* defines this dc.* meta,
1073 # if not, then look for whether there's an ex.* equivalent
1074
1075 if (defined $explicit_ex_dc->{$lc_k}) {
1076 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1077 my $dc_element = $lc_k;
1078 my $escaped_value = $v;
1079
1080 if (defined $version && ($version eq "oai_dc")) {
1081 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1082 }
1083 else {
1084 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1085 }
1086 }
1087 } elsif (defined $explicit_ex->{$lc_k}) {
1088 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1089 my $dc_element = $lc_k;
1090 my $escaped_value = $v;
1091
1092 if (defined $version && ($version eq "oai_dc")) {
1093 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1094 }
1095 else {
1096 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1097 }
1098 }
1099 }
1100 }
1101 }
1102
1103 if ($all_text eq "") {
1104 $all_text .= " There is no Dublin Core metatdata in this document\n";
1105 }
1106 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1107
1108 return $all_text;
1109}
1110
1111# Build up dublin_core metadata. Priority given to dc.* over ex.*
1112# This method was apparently added by Jeffrey and committed by Shaoqun.
1113# But we don't know why it was added, so not using it anymore.
1114sub new_get_dc_metadata {
1115
1116 my $self = shift(@_);
1117 my ($doc_obj, $section, $version) = @_;
1118
1119 # build up string of dublin core metadata
1120 $section="" unless defined $section;
1121
1122 my $section_ptr=$doc_obj->_lookup_section($section);
1123 return "" unless defined $section_ptr;
1124
1125 my $all_text = "";
1126 foreach my $data (@{$section_ptr->{'metadata'}}){
1127 my $escaped_value = &docprint::escape_text($data->[1]);
1128 my $dc_element = $data->[0];
1129
1130 my @array = split('\.',$dc_element);
1131 my ($type,$name);
1132
1133 if(defined $array[1])
1134 {
1135 $type = $array[0];
1136 $name = $array[1];
1137 }
1138 else
1139 {
1140 $type = "ex";
1141 $name = $array[0];
1142 }
1143
1144 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1145 }
1146 return $all_text;
1147}
1148
1149
11501;
Note: See TracBrowser for help on using the repository browser.