source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:keywords set to Author Date Id Revision
File size: 32.2 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"}
85];
86
87my $options = { 'name' => "BasePlugout",
88 'desc' => "{BasPlugout.desc}",
89 'abstract' => "yes",
90 'inherits' => "no",
91 'args' => $arguments};
92
93sub new
94{
95 my $class = shift (@_);
96
97 my ($plugoutlist,$args,$hashArgOptLists) = @_;
98 push(@$plugoutlist, $class);
99
100 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
101
102 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
103 push(@{$hashArgOptLists->{"OptList"}},$options);
104
105 my $self = {};
106 $self->{'plugout_type'} = $class;
107 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
108 $self->{"info_only"} = 0;
109
110 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
111 # the args, just return the object.
112 foreach my $strArg (@{$args})
113 {
114 if(defined $strArg && $strArg eq "-gsdlinfo")
115 {
116 $self->{"info_only"} = 1;
117 return bless $self, $class;
118 }
119 }
120
121 delete $self->{"info_only"};
122
123 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
124 {
125 my $classTempClass = bless $self, $class;
126 print STDERR "<BadPlugout d=$plugout_name>\n";
127 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
128 $classTempClass->print_txt_usage(""); # Use default resource bundle
129 die "\n";
130 }
131
132
133 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
134 {
135 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
136 if (!defined $full_file_path) {
137 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
138 die "\n";
139 }
140 $self->{'xslt_file'} = $full_file_path;
141 }
142
143 $self->{'gs_count'} = 0;
144
145 $self->{'keep_import_structure'} = 0;
146
147 return bless $self, $class;
148
149}
150
151sub print_xml_usage
152{
153 my $self = shift(@_);
154 my $header = shift(@_);
155 my $high_level_information_only = shift(@_);
156
157 # XML output is always in UTF-8
158 gsprintf::output_strings_in_UTF8;
159
160 if ($header) {
161 &PrintUsage::print_xml_header("plugout");
162 }
163 $self->print_xml($high_level_information_only);
164}
165
166
167sub print_xml
168{
169 my $self = shift(@_);
170 my $high_level_information_only = shift(@_);
171
172 my $optionlistref = $self->{'option_list'};
173 my @optionlist = @$optionlistref;
174 my $plugoutoptions = shift(@$optionlistref);
175 return if (!defined($plugoutoptions));
176
177 gsprintf(STDERR, "<PlugoutInfo>\n");
178 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
179 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
180 $desc =~ s/</&amp;lt;/g; # doubly escaped
181 $desc =~ s/>/&amp;gt;/g;
182 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
183 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
184 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
185 unless (defined($high_level_information_only)) {
186 gsprintf(STDERR, " <Arguments>\n");
187 if (defined($plugoutoptions->{'args'})) {
188 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
189 }
190 gsprintf(STDERR, " </Arguments>\n");
191
192 # Recurse up the plugout hierarchy
193 $self->print_xml();
194 }
195 gsprintf(STDERR, "</PlugoutInfo>\n");
196}
197
198
199sub print_txt_usage
200{
201 my $self = shift(@_);
202
203 # Print the usage message for a plugout (recursively)
204 my $descoffset = $self->determine_description_offset(0);
205 $self->print_plugout_usage($descoffset, 1);
206}
207
208sub determine_description_offset
209{
210 my $self = shift(@_);
211 my $maxoffset = shift(@_);
212
213 my $optionlistref = $self->{'option_list'};
214 my @optionlist = @$optionlistref;
215 my $plugoutoptions = pop(@$optionlistref);
216 return $maxoffset if (!defined($plugoutoptions));
217
218 # Find the length of the longest option string of this download
219 my $plugoutargs = $plugoutoptions->{'args'};
220 if (defined($plugoutargs)) {
221 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
222 if ($longest > $maxoffset) {
223 $maxoffset = $longest;
224 }
225 }
226
227 # Recurse up the download hierarchy
228 $maxoffset = $self->determine_description_offset($maxoffset);
229 $self->{'option_list'} = \@optionlist;
230 return $maxoffset;
231}
232
233
234sub print_plugout_usage
235{
236 my $self = shift(@_);
237 my $descoffset = shift(@_);
238 my $isleafclass = shift(@_);
239
240 my $optionlistref = $self->{'option_list'};
241 my @optionlist = @$optionlistref;
242 my $plugoutoptions = shift(@$optionlistref);
243 return if (!defined($plugoutoptions));
244
245 my $plugoutname = $plugoutoptions->{'name'};
246 my $plugoutargs = $plugoutoptions->{'args'};
247 my $plugoutdesc = $plugoutoptions->{'desc'};
248
249 # Produce the usage information using the data structure above
250 if ($isleafclass) {
251 if (defined($plugoutdesc)) {
252 gsprintf(STDERR, "$plugoutdesc\n\n");
253 }
254 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
255 }
256
257 # Display the download options, if there are some
258 if (defined($plugoutargs)) {
259 # Calculate the column offset of the option descriptions
260 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
261
262 if ($isleafclass) {
263 gsprintf(STDERR, " {common.specific_options}:\n");
264 }
265 else {
266 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
267 }
268
269 # Display the download options
270 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
271 }
272
273 # Recurse up the download hierarchy
274 $self->print_plugout_usage($descoffset, 0);
275 $self->{'option_list'} = \@optionlist;
276}
277
278
279sub error
280{
281 my ($strFunctionName,$strError) = @_;
282 {
283 print "Error occoured in BasePlugout.pm\n".
284 "In Function: ".$strFunctionName."\n".
285 "Error Message: ".$strError."\n";
286 exit(-1);
287 }
288}
289
290# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
291sub set_OIDtype {
292 my $self = shift (@_);
293 my ($type, $metadata) = @_;
294
295 if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
296 $self->{'OIDtype'} = $type;
297 } else {
298 $self->{'OIDtype'} = "hash";
299 }
300 if ($type =~ /^assigned$/) {
301 if (defined $metadata) {
302 $self->{'OIDmetadata'} = $metadata;
303 } else {
304 $self->{'OIDmetadata'} = "dc.Identifier";
305 }
306 }
307}
308
309sub set_output_dir
310{
311 my $self = shift @_;
312 my ($output_dir) = @_;
313
314 $self->{'output_dir'} = $output_dir;
315}
316
317sub setoutputdir
318{
319 my $self = shift @_;
320 my ($output_dir) = @_;
321
322 $self->{'output_dir'} = $output_dir;
323}
324
325sub get_output_dir
326{
327 my $self = shift (@_);
328
329 return $self->{'output_dir'};
330}
331
332sub getoutputdir
333{
334 my $self = shift (@_);
335
336 return $self->{'output_dir'};
337}
338
339sub getoutputinfo
340{
341 my $self = shift (@_);
342
343 return $self->{'output_info'};
344}
345
346
347sub get_output_handler
348{
349 my $self = shift (@_);
350
351 my ($output_file_name) = @_;
352
353 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
354
355 return *OUTPUT;
356}
357
358sub release_output_handler
359{
360 my $self = shift (@_);
361 my ($outhandler) = @_;
362
363 close($outhandler);
364
365}
366
367sub output_xml_header {
368 my $self = shift (@_);
369 my ($handle,$docroot,$nondoctype) = @_;
370
371
372 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
373
374 #For Dspace must be UTF in lower case
375 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
376
377 if (!defined $nondoctype){
378 my $doctype = (defined $docroot) ? $docroot : "Section";
379
380 # Used to be '<!DOCTYPE Archive SYSTEM ...'
381
382 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
383 }
384
385 print $handle "<$docroot>\n" if defined $docroot;
386}
387
388sub output_xml_footer {
389 my $self = shift (@_);
390 my ($handle,$docroot) = @_;
391 print $handle "</$docroot>\n" if defined $docroot;
392}
393
394
395sub output_general_xml_header
396{
397 my $self = shift (@_);
398 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
399
400 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
401
402 if (defined $opt_dtd) {
403 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
404 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
405 }
406
407 if (defined $docroot) {
408 my $full_docroot = $docroot;
409 if (defined $opt_attributes) {
410 $full_docroot .= " $opt_attributes";
411 }
412
413 print $handle "<$full_docroot>\n"
414 }
415}
416
417sub output_general_xml_footer
418{
419 output_xml_footer(@_);
420}
421
422
423sub process {
424 my $self = shift (@_);
425 my ($doc_obj) = @_;
426
427 # for OAI purposes
428 $doc_obj->set_lastmodified();
429 $doc_obj->set_oailastmodified();
430
431 if ($self->{'group_size'} > 1) {
432 $self->group_process ($doc_obj);
433 return;
434 }
435
436 my $OID = $doc_obj->get_OID();
437 $OID = "NULL" unless defined $OID;
438
439 my $top_section = $doc_obj->get_top_section();
440
441 #get document's directory
442 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
443
444 my $output_info = $self->{'output_info'};
445 return if (!defined $output_info);
446
447 ##############################
448 # call subclass' saveas method
449 ##############################
450 $self->saveas($doc_obj,$doc_dir);
451 $self->archiveinf_db($doc_obj,$doc_dir);
452
453}
454
455sub store_output_info_reference {
456 my $self = shift (@_);
457 my ($doc_obj) = @_;
458
459 my $output_info = $self->{'output_info'};
460 my $metaname = $self->{'sortmeta'};
461 if (!defined $metaname || $metaname !~ /\S/) {
462 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
463 return;
464 }
465
466 my $metadata = "";
467 my $top_section = $doc_obj->get_top_section();
468
469 my @commameta_list = split(/,/, $metaname);
470 foreach my $cmn (@commameta_list) {
471 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
472 if ($meta) {
473 # do remove prefix/suffix - this will apply to all values
474 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
475 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
476 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
477 $metadata .= $meta if ($meta);
478 }
479 }
480
481 # store reference in the output_info
482 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
483
484}
485
486sub group_process {
487
488 my $self = shift (@_);
489 my ($doc_obj) = @_;
490
491 my $OID = $doc_obj->get_OID();
492 $OID = "NULL" unless defined $OID;
493
494 my $groupsize = $self->{'group_size'};
495 my $gs_count = $self->{'gs_count'};
496 my $open_new_file = (($gs_count % $groupsize)==0);
497 my $outhandle = $self->{'output_handle'};
498
499 # opening a new file, or document has assoicated files => directory needed
500 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
501
502 # The directory the archive file (doc.xml) and all associated files
503 # should end up in
504 my $doc_dir;
505 # If we've determined its time for a new file, open it now
506 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
507 {
508 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
509 # only if opening new file
510 my $output_dir = $self->get_output_dir();
511 &util::mk_all_dir ($output_dir) unless -e $output_dir;
512 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
513 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
514
515 if ($gs_count>0)
516 {
517 return if (!$self->close_file_output());
518 }
519
520 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
521
522
523 $self->{'gs_filename'} = $doc_file;
524 $self->{'short_doc_file'} = $short_doc_file;
525 $self->{'gs_OID'} = $OID;
526 $self->{'gs_doc_dir'} = $doc_dir;
527
528 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
529 }
530 # Otherwise load the same archive document directory used last time
531 else
532 {
533 $doc_dir = $self->{'gs_doc_dir'};
534 }
535
536 # copy all the associated files, add this information as metadata
537 # to the document
538 print $outhandle "Writing associated files to $doc_dir\n";
539 $self->process_assoc_files ($doc_obj, $doc_dir);
540
541 # look up 'gsdlmetafile' metadata and store that information
542 # explicitly in $doc_obj
543 $self->process_metafiles_metadata ($doc_obj);
544 }
545
546 # save this document
547 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
548 print GROUPPROCESS $section_text;
549
550 $self->{'gs_count'}++;
551}
552
553
554sub saveas {
555 my $self = shift (@_);
556
557 die "Basplug::saveas function must be implemented in sub classes\n";
558}
559
560sub get_doc_dir {
561 my $self = shift (@_);
562 my ($OID, $source_filename) = @_;
563
564 my $working_dir = $self->get_output_dir();
565 my $working_info = $self->{'output_info'};
566 return if (!defined $working_info);
567
568 my $doc_info = $working_info->get_info($OID);
569 my $doc_dir = '';
570
571 if (defined $doc_info && scalar(@$doc_info) >= 1)
572 {
573 # This OID already has an archives directory, so use it again
574 $doc_dir = $doc_info->[0];
575 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
576 }
577 elsif ($self->{'keep_import_structure'})
578 {
579 $source_filename = &File::Basename::dirname($source_filename);
580 $source_filename =~ s/[\\\/]+/\//g;
581 $source_filename =~ s/\/$//;
582
583 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
584 }
585
586 # We have to use a new archives directory for this document
587 if ($doc_dir eq "")
588 {
589 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
590 }
591
592 if (!defined $self->{'group'} || !$self->{'group'}){
593 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
594 }
595
596 return $doc_dir;
597}
598
599sub get_new_doc_dir{
600 my $self = shift (@_);
601 my($working_info,$working_dir,$OID) = @_;
602
603
604 my $doc_dir = "";
605 my $doc_dir_rest = $OID;
606
607 # remove any \ and / from the OID
608 $doc_dir_rest =~ s/[\\\/]//g;
609
610 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
611 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
612
613 my $doc_dir_num = 0;
614
615 do {
616 $doc_dir .= "/" if $doc_dir_num > 0;
617 if ($doc_dir_rest =~ s/^(.{1,8})//) {
618 $doc_dir .= $1;
619 $doc_dir_num++;
620 }
621 } while ($doc_dir_rest ne "" &&
622 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
623 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
624 my $i = 1;
625 my $doc_dir_base = $doc_dir;
626 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
627 $doc_dir = "$doc_dir_base-$i";
628 $i++;
629 }
630
631 return "$doc_dir.dir";
632}
633
634sub process_assoc_files {
635 my $self = shift (@_);
636 my ($doc_obj, $doc_dir, $handle) = @_;
637
638 my $outhandle = $self->{'output_handle'};
639
640 my $output_dir = $self->get_output_dir();
641 return if (!defined $output_dir);
642
643 &util::mk_all_dir ($output_dir) unless -e $output_dir;
644
645 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
646 &util::mk_all_dir ($working_dir) unless -e $working_dir;
647
648 my @assoc_files = ();
649 my $filename;;
650
651 my $source_filename = $doc_obj->get_source_filename();
652
653 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
654
655 if (defined $collect_dir) {
656 my $dirsep_regexp = &util::get_os_dirsep();
657
658 if ($collect_dir !~ /$dirsep_regexp$/) {
659 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
660 }
661
662 # This test is never going to fail on Windows -- is this a problem?
663
664 if ($source_filename !~ /^$dirsep_regexp/) {
665 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
666 }
667 }
668
669
670 # set the assocfile path (even if we have no assoc files - need this for lucene)
671 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
672 "assocfilepath",
673 "$doc_dir");
674 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
675 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
676 $dir = "" unless defined $dir;
677
678 my $utf8_real_filename = $assoc_file_rec->[0];
679
680 # for some reasons the image associate file has / before the full path
681 $utf8_real_filename =~ s/^\\(.*)/$1/i;
682
683## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
684 my $real_filename = $utf8_real_filename;
685 $real_filename = &util::downgrade_if_dos_filename($real_filename);
686
687 if (-e $real_filename) {
688
689 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
690
691 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
692
693 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
694 "gsdlassocfile",
695 "$afile:$assoc_file_rec->[2]:$dir");
696 } elsif ($self->{'verbosity'} > 1) {
697 print $outhandle "BasePlugout::process couldn't copy the associated file " .
698 "$real_filename to $afile\n";
699 }
700 }
701}
702
703
704sub process_metafiles_metadata
705{
706 my $self = shift (@_);
707 my ($doc_obj) = @_;
708
709 my $top_section = $doc_obj->get_top_section();
710 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
711
712 foreach my $metafile_pair (@$metafiles) {
713 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
714
715 $doc_obj->metadata_file($full_metafile,$metafile);
716 }
717
718 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
719}
720
721sub archiveinf_files_to_field
722{
723 my $self = shift(@_);
724 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
725
726 foreach my $file_rec (@$files) {
727 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
728 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
729 # for some reasons the image associate file has / before the full path
730 $real_filename =~ s/^\\(.*)/$1/i;
731
732 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
733
734 if (-e $raw_filename) {
735
736# if (defined $collect_dir) {
737# my $collect_dir_re_safe = $collect_dir;
738# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
739# $collect_dir_re_safe =~ s/\./\\./g;##
740
741# $real_filename =~ s/^$collect_dir_re_safe//;
742# }
743
744 if (defined $reverse_lookups) {
745 $reverse_lookups->{$real_filename} = 1;
746 }
747### push(@{$oid_files->{$field}},$full_file);
748 push(@{$oid_files->{$field}},$raw_filename);
749 }
750 else {
751 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
752 }
753 }
754}
755
756sub archiveinf_db
757{
758 my $self = shift (@_);
759 my ($doc_obj) = @_;
760
761 my $verbosity = $self->{'verbosity'};
762
763 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
764 if (defined $collect_dir) {
765 my $dirsep_regexp = &util::get_os_dirsep();
766
767 if ($collect_dir !~ /$dirsep_regexp$/) {
768 # ensure there is a slash at the end
769 $collect_dir .= &util::get_dirsep();
770 }
771 }
772
773 my $oid = $doc_obj->get_OID();
774 my $source_filename = $doc_obj->get_unmodified_source_filename();
775 my $working_info = $self->{'output_info'};
776 my $doc_info = $working_info->get_info($oid);
777
778 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
779 # doc_file is the path to the archive doc.xml. Make sure it has unix
780 # slashes, then if the collection is copied to linux, it can be built without reimport
781 $doc_file =~ s/\\/\//g;
782 my $oid_files = { 'doc-file' => $doc_file,
783 'index-status' => $index_status,
784 'src-file' => $source_filename,
785 'sort-meta' => $sortmeta,
786 'assoc-file' => [],
787 'meta-file' => [] };
788
789 my $reverse_lookups = { $source_filename => "1" };
790
791
792 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
793 $collect_dir,$oid_files,$reverse_lookups);
794
795
796 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
797 $collect_dir,$oid_files);
798
799 # Get the infodbtype value for this collection from the arcinfo object
800 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
801 my $output_dir = $self->{'output_dir'};
802
803 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
804 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
805
806 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
807
808 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
809 my $top_section = $doc_obj->get_top_section();
810
811 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
812 my $dc_title = join("; ", @$dc_titles);
813
814 if ($oid_files->{'index-status'} eq "R") {
815 $dc_title .= " (Updated)";
816 }
817
818 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
819 if (open(RSSOUT,">>$rss_filename")) {
820 print RSSOUT "<item>\n";
821 print RSSOUT " <title>$dc_title</title>\n";
822 print RSSOUT " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
823 print RSSOUT "</item>\n";
824 close(RSSOUT);
825 }
826 else {
827 print STDERR "**** Failed to open $rss_filename\n!$\n";
828 }
829
830
831 }
832
833 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
834 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
835 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
836 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
837
838 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
839 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
840 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
841
842 foreach my $rl (keys %$reverse_lookups) {
843 $working_info->add_reverseinfo($rl,$oid);
844 }
845
846 # meta files not set in reverese entry, but need to set the metadata flag
847 if (defined $doc_obj->get_meta_files()) {
848 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
849 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
850 $working_info->set_meta_file_flag($full_file);
851 }
852 }
853}
854
855
856sub set_sortmeta {
857 my $self = shift (@_);
858 my ($sortmeta, $removeprefix, $removesuffix) = @_;
859
860 $self->{'sortmeta'} = $sortmeta;
861 if (defined ($removeprefix) && $removeprefix ) {
862 $removeprefix =~ s/^\^//; # don't need a leading ^
863 $self->{'removeprefix'} = $removeprefix;
864 }
865 if (defined ($removesuffix) && $removesuffix) {
866 $removesuffix =~ s/\$$//; # don't need a trailing $
867 $self->{'removesuffix'} = $removesuffix;
868 }
869}
870
871sub open_xslt_pipe
872{
873 my $self = shift @_;
874 my ($output_file_name, $xslt_file)=@_;
875
876 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
877
878 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
879
880 my $mapping_file_path = "";
881
882 if ($ENV{'GSDLOS'} eq "windows"){
883 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
884 # this file:/// bit didn't work for me on windows XP
885 #$xslt_file = "\"file:///".$xslt_file."\"";
886 #$mapping_file_path = "\"file:///";
887 }
888 else{
889 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
890 }
891
892
893 $java_class_path = "\"".$java_class_path."\"";
894
895 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
896
897 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
898 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
899 $cmd .= "-m $mapping_file_path";
900 }
901
902 open(*XMLWRITER, $cmd)
903 or die "can't open pipe to xslt: $!";
904
905
906 $self->{'xslt_writer'} = *XMLWRITER;
907
908 print XMLWRITER "<?DocStart?>\n";
909 print XMLWRITER "$output_file_name\n";
910
911
912 }
913
914
915sub close_xslt_pipe
916{
917 my $self = shift @_;
918
919
920 return unless defined $self->{'xslt_writer'} ;
921
922 my $xsltwriter = $self->{'xslt_writer'};
923
924 print $xsltwriter "<?DocEnd?>\n";
925 close($xsltwriter);
926
927 undef $self->{'xslt_writer'};
928
929}
930
931sub close_file_output
932{
933 my ($self) = @_;
934
935 # make sure that the handle has been opened - it won't be if we failed
936 # to import any documents...
937 if (defined(fileno(GROUPPROCESS))) {
938 $self->output_xml_footer('GROUPPROCESS','Archive');
939 close GROUPPROCESS;
940 }
941
942 my $OID = $self->{'gs_OID'};
943 my $short_doc_file = $self->{'short_doc_file'};
944
945 if ($self->{'gzip'}) {
946 my $doc_file = $self->{'gs_filename'};
947 `gzip $doc_file`;
948 $doc_file .= ".gz";
949 $short_doc_file .= ".gz";
950 if (!-e $doc_file) {
951 my $outhandle = $self->{'output_handle'};
952 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
953 return 0;
954 }
955 }
956
957 # store reference in output_info
958 my $output_info = $self->{'output_info'};
959 return 0 if (!defined $output_info);
960 $output_info->add_info($OID, $short_doc_file, undef, undef);
961 return 1;
962}
963
964
965#the subclass should implement this method if is_group method could return 1.
966sub close_group_output{
967 my $self = shift (@_);
968}
969
970sub is_group {
971 my $self = shift (@_);
972 return 0;
973}
974
975my $dc_set = { Title => 1,
976 Creator => 1,
977 Subject => 1,
978 Description => 1,
979 Publisher => 1,
980 Contributor => 1,
981 Date => 1,
982 Type => 1,
983 Format => 1,
984 Identifier => 1,
985 Source => 1,
986 Language => 1,
987 Relation => 1,
988 Coverage => 1,
989 Rights => 1};
990
991
992# returns an XML representation of the dublin core metadata
993# if dc meta is not found, try ex meta
994# This method is not used by the DSpacePlugout, which has its
995# own method to save its dc metadata
996sub get_dc_metadata {
997 my $self = shift(@_);
998 my ($doc_obj, $section, $version) = @_;
999
1000 # build up string of dublin core metadata
1001 $section="" unless defined $section;
1002
1003 my $section_ptr = $doc_obj->_lookup_section($section);
1004 return "" unless defined $section_ptr;
1005
1006
1007 my $explicit_dc = {};
1008 my $explicit_ex_dc = {};
1009 my $explicit_ex = {};
1010
1011 my $all_text="";
1012
1013 # We want high quality dc metadata to go in first, so we store all the
1014 # assigned dc.* values first. Then, for all those dc metadata names in
1015 # the official dc set that are as yet unassigned, we look to see whether
1016 # embedded ex.dc.* metadata has defined some values for them. If not,
1017 # then for the same missing dc metadata names, we look in ex metadata.
1018
1019 foreach my $data (@{$section_ptr->{'metadata'}}){
1020 my $escaped_value = &docprint::escape_text($data->[1]);
1021 if ($data->[0]=~ m/^dc\./) {
1022 $data->[0] =~ tr/[A-Z]/[a-z]/;
1023
1024 $data->[0] =~ m/^dc\.(.*)/;
1025 my $dc_element = $1;
1026
1027 if (!defined $explicit_dc->{$dc_element}) {
1028 $explicit_dc->{$dc_element} = [];
1029 }
1030 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1031
1032 if (defined $version && ($version eq "oai_dc")) {
1033 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1034 }
1035 else {
1036 # qualifier???
1037 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1038 }
1039
1040 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1041 $data->[0] =~ m/^ex\.dc\.(.*)/;
1042 my $ex_dc_element = $1;
1043 my $lc_ex_dc_element = lc($ex_dc_element);
1044
1045 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1046 if (defined $dc_set->{$ex_dc_element}) {
1047 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1048 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1049 }
1050 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1051 }
1052 }
1053 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1054 $data->[0] =~ m/^(ex\.)?(.*)/;
1055 my $ex_element = $2;
1056 my $lc_ex_element = lc($ex_element);
1057
1058 if (defined $dc_set->{$ex_element}) {
1059 if (!defined $explicit_ex->{$lc_ex_element}) {
1060 $explicit_ex->{$lc_ex_element} = [];
1061 }
1062 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1063 }
1064 }
1065 }
1066
1067 # go through dc_set and for any element *not* defined in explicit_dc
1068 # that does exist in explicit_ex, add it in as metadata
1069 foreach my $k ( keys %$dc_set ) {
1070 my $lc_k = lc($k);
1071
1072 if (!defined $explicit_dc->{$lc_k}) {
1073 # try to find if ex.dc.* defines this dc.* meta,
1074 # if not, then look for whether there's an ex.* equivalent
1075
1076 if (defined $explicit_ex_dc->{$lc_k}) {
1077 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1078 my $dc_element = $lc_k;
1079 my $escaped_value = $v;
1080
1081 if (defined $version && ($version eq "oai_dc")) {
1082 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1083 }
1084 else {
1085 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1086 }
1087 }
1088 } elsif (defined $explicit_ex->{$lc_k}) {
1089 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1090 my $dc_element = $lc_k;
1091 my $escaped_value = $v;
1092
1093 if (defined $version && ($version eq "oai_dc")) {
1094 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1095 }
1096 else {
1097 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1098 }
1099 }
1100 }
1101 }
1102 }
1103
1104 if ($all_text eq "") {
1105 $all_text .= " There is no Dublin Core metatdata in this document\n";
1106 }
1107 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1108
1109 return $all_text;
1110}
1111
1112# Build up dublin_core metadata. Priority given to dc.* over ex.*
1113# This method was apparently added by Jeffrey and committed by Shaoqun.
1114# But we don't know why it was added, so not using it anymore.
1115sub new_get_dc_metadata {
1116
1117 my $self = shift(@_);
1118 my ($doc_obj, $section, $version) = @_;
1119
1120 # build up string of dublin core metadata
1121 $section="" unless defined $section;
1122
1123 my $section_ptr=$doc_obj->_lookup_section($section);
1124 return "" unless defined $section_ptr;
1125
1126 my $all_text = "";
1127 foreach my $data (@{$section_ptr->{'metadata'}}){
1128 my $escaped_value = &docprint::escape_text($data->[1]);
1129 my $dc_element = $data->[0];
1130
1131 my @array = split('\.',$dc_element);
1132 my ($type,$name);
1133
1134 if(defined $array[1])
1135 {
1136 $type = $array[0];
1137 $name = $array[1];
1138 }
1139 else
1140 {
1141 $type = "ex";
1142 $name = $array[0];
1143 }
1144
1145 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1146 }
1147 return $all_text;
1148}
1149
1150
11511;
Note: See TracBrowser for help on using the repository browser.