source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27505

Last change on this file since 27505 was 27505, checked in by jmt12, 11 years ago

Closing the RSS filehandle with the new function in FileUtils too

  • Property svn:keywords set to Author Date Id Revision
File size: 34.0 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "no",
96 'hiddengli' => "no"}
97];
98
99my $options = { 'name' => "BasePlugout",
100 'desc' => "{BasPlugout.desc}",
101 'abstract' => "yes",
102 'inherits' => "no",
103 'args' => $arguments};
104
105sub new
106{
107 my $class = shift (@_);
108
109 my ($plugoutlist,$args,$hashArgOptLists) = @_;
110 push(@$plugoutlist, $class);
111
112 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
113
114 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115 push(@{$hashArgOptLists->{"OptList"}},$options);
116
117 my $self = {};
118 $self->{'plugout_type'} = $class;
119 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120 $self->{"info_only"} = 0;
121
122 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123 # the args, just return the object.
124 foreach my $strArg (@{$args})
125 {
126 if(defined $strArg && $strArg eq "-gsdlinfo")
127 {
128 $self->{"info_only"} = 1;
129 return bless $self, $class;
130 }
131 }
132
133 delete $self->{"info_only"};
134
135 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
136 {
137 my $classTempClass = bless $self, $class;
138 print STDERR "<BadPlugout d=$plugout_name>\n";
139 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
140 $classTempClass->print_txt_usage(""); # Use default resource bundle
141 die "\n";
142 }
143
144
145 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
146 {
147 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148 if (!defined $full_file_path) {
149 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150 die "\n";
151 }
152 $self->{'xslt_file'} = $full_file_path;
153 }
154
155 $self->{'gs_count'} = 0;
156
157 $self->{'keep_import_structure'} = 0;
158
159 return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165 my $self = shift(@_);
166 my $header = shift(@_);
167 my $high_level_information_only = shift(@_);
168
169 # XML output is always in UTF-8
170 gsprintf::output_strings_in_UTF8;
171
172 if ($header) {
173 &PrintUsage::print_xml_header("plugout");
174 }
175 $self->print_xml($high_level_information_only);
176}
177
178
179sub print_xml
180{
181 my $self = shift(@_);
182 my $high_level_information_only = shift(@_);
183
184 my $optionlistref = $self->{'option_list'};
185 my @optionlist = @$optionlistref;
186 my $plugoutoptions = shift(@$optionlistref);
187 return if (!defined($plugoutoptions));
188
189 gsprintf(STDERR, "<PlugoutInfo>\n");
190 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
191 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192 $desc =~ s/</&amp;lt;/g; # doubly escaped
193 $desc =~ s/>/&amp;gt;/g;
194 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
195 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
197 unless (defined($high_level_information_only)) {
198 gsprintf(STDERR, " <Arguments>\n");
199 if (defined($plugoutoptions->{'args'})) {
200 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201 }
202 gsprintf(STDERR, " </Arguments>\n");
203
204 # Recurse up the plugout hierarchy
205 $self->print_xml();
206 }
207 gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213 my $self = shift(@_);
214
215 # Print the usage message for a plugout (recursively)
216 my $descoffset = $self->determine_description_offset(0);
217 $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222 my $self = shift(@_);
223 my $maxoffset = shift(@_);
224
225 my $optionlistref = $self->{'option_list'};
226 my @optionlist = @$optionlistref;
227 my $plugoutoptions = pop(@$optionlistref);
228 return $maxoffset if (!defined($plugoutoptions));
229
230 # Find the length of the longest option string of this download
231 my $plugoutargs = $plugoutoptions->{'args'};
232 if (defined($plugoutargs)) {
233 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234 if ($longest > $maxoffset) {
235 $maxoffset = $longest;
236 }
237 }
238
239 # Recurse up the download hierarchy
240 $maxoffset = $self->determine_description_offset($maxoffset);
241 $self->{'option_list'} = \@optionlist;
242 return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248 my $self = shift(@_);
249 my $descoffset = shift(@_);
250 my $isleafclass = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $plugoutoptions = shift(@$optionlistref);
255 return if (!defined($plugoutoptions));
256
257 my $plugoutname = $plugoutoptions->{'name'};
258 my $plugoutargs = $plugoutoptions->{'args'};
259 my $plugoutdesc = $plugoutoptions->{'desc'};
260
261 # Produce the usage information using the data structure above
262 if ($isleafclass) {
263 if (defined($plugoutdesc)) {
264 gsprintf(STDERR, "$plugoutdesc\n\n");
265 }
266 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267 }
268
269 # Display the download options, if there are some
270 if (defined($plugoutargs)) {
271 # Calculate the column offset of the option descriptions
272 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
273
274 if ($isleafclass) {
275 gsprintf(STDERR, " {common.specific_options}:\n");
276 }
277 else {
278 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279 }
280
281 # Display the download options
282 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283 }
284
285 # Recurse up the download hierarchy
286 $self->print_plugout_usage($descoffset, 0);
287 $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293 my ($strFunctionName,$strError) = @_;
294 {
295 print "Error occoured in BasePlugout.pm\n".
296 "In Function: ".$strFunctionName."\n".
297 "Error Message: ".$strError."\n";
298 exit(-1);
299 }
300}
301
302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
303sub set_OIDtype {
304 my $self = shift (@_);
305 my ($type, $metadata) = @_;
306
307 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
308 $self->{'OIDtype'} = $type;
309 } else {
310 $self->{'OIDtype'} = "hash";
311 }
312 if ($type =~ /^assigned$/) {
313 if (defined $metadata) {
314 $self->{'OIDmetadata'} = $metadata;
315 } else {
316 $self->{'OIDmetadata'} = "dc.Identifier";
317 }
318 }
319}
320
321sub set_output_dir
322{
323 my $self = shift @_;
324 my ($output_dir) = @_;
325
326 $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331 my $self = shift @_;
332 my ($output_dir) = @_;
333
334 $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339 my $self = shift (@_);
340
341 return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346 my $self = shift (@_);
347
348 return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353 my $self = shift (@_);
354
355 return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361 my $self = shift (@_);
362
363 my ($output_file_name) = @_;
364
365 my $fh;
366 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
367
368 return $fh;
369}
370
371sub release_output_handler
372{
373 my $self = shift (@_);
374 my ($outhandler) = @_;
375
376 close($outhandler);
377
378}
379
380sub output_xml_header {
381 my $self = shift (@_);
382 my ($handle,$docroot,$nondoctype) = @_;
383
384
385 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
386
387 #For Dspace must be UTF in lower case
388 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
389
390 if (!defined $nondoctype){
391 my $doctype = (defined $docroot) ? $docroot : "Section";
392
393 # Used to be '<!DOCTYPE Archive SYSTEM ...'
394
395 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
396 }
397
398 print $handle "<$docroot>\n" if defined $docroot;
399}
400
401sub output_xml_footer {
402 my $self = shift (@_);
403 my ($handle,$docroot) = @_;
404 print $handle "</$docroot>\n" if defined $docroot;
405}
406
407
408sub output_general_xml_header
409{
410 my $self = shift (@_);
411 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
412
413 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
414
415 if (defined $opt_dtd) {
416 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
417 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
418 }
419
420 if (defined $docroot) {
421 my $full_docroot = $docroot;
422 if (defined $opt_attributes) {
423 $full_docroot .= " $opt_attributes";
424 }
425
426 print $handle "<$full_docroot>\n"
427 }
428}
429
430sub output_general_xml_footer
431{
432 output_xml_footer(@_);
433}
434
435
436sub process {
437 my $self = shift (@_);
438 my ($doc_obj) = @_;
439
440 # for OAI purposes
441 $doc_obj->set_lastmodified();
442 $doc_obj->set_oailastmodified();
443
444 if ($self->{'group_size'} > 1) {
445 $self->group_process ($doc_obj);
446 return;
447 }
448
449 my $OID = $doc_obj->get_OID();
450 $OID = "NULL" unless defined $OID;
451
452 my $top_section = $doc_obj->get_top_section();
453
454 #get document's directory
455 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456
457 my $output_info = $self->{'output_info'};
458 return if (!defined $output_info);
459
460 ##############################
461 # call subclass' saveas method
462 ##############################
463 $self->saveas($doc_obj,$doc_dir);
464 $self->archiveinf_db($doc_obj,$doc_dir);
465
466}
467
468sub store_output_info_reference {
469 my $self = shift (@_);
470 my ($doc_obj) = @_;
471
472 my $output_info = $self->{'output_info'};
473 my $metaname = $self->{'sortmeta'};
474 if (!defined $metaname || $metaname !~ /\S/) {
475 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
476 return;
477 }
478
479 my $metadata = "";
480 my $top_section = $doc_obj->get_top_section();
481
482 my @commameta_list = split(/,/, $metaname);
483 foreach my $cmn (@commameta_list) {
484 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
485 if ($meta) {
486 # do remove prefix/suffix - this will apply to all values
487 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
488 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
489 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
490 $metadata .= $meta if ($meta);
491 }
492 }
493
494 # store reference in the output_info
495 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
496
497}
498
499sub group_process {
500
501 my $self = shift (@_);
502 my ($doc_obj) = @_;
503
504 my $OID = $doc_obj->get_OID();
505 $OID = "NULL" unless defined $OID;
506
507 my $groupsize = $self->{'group_size'};
508 my $gs_count = $self->{'gs_count'};
509 my $open_new_file = (($gs_count % $groupsize)==0);
510 my $outhandle = $self->{'output_handle'};
511
512 # opening a new file, or document has assoicated files => directory needed
513 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
514
515 # The directory the archive file (doc.xml) and all associated files
516 # should end up in
517 my $doc_dir;
518 # If we've determined its time for a new file, open it now
519 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
520 {
521 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
522 # only if opening new file
523 my $output_dir = $self->get_output_dir();
524 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
525 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
526 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
527
528 if ($gs_count>0)
529 {
530 return if (!$self->close_file_output());
531 }
532
533 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
534
535
536 $self->{'gs_filename'} = $doc_file;
537 $self->{'short_doc_file'} = $short_doc_file;
538 $self->{'gs_OID'} = $OID;
539 $self->{'gs_doc_dir'} = $doc_dir;
540
541 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
542 }
543 # Otherwise load the same archive document directory used last time
544 else
545 {
546 $doc_dir = $self->{'gs_doc_dir'};
547 }
548
549 # copy all the associated files, add this information as metadata
550 # to the document
551 print $outhandle "Writing associated files to $doc_dir\n";
552 $self->process_assoc_files ($doc_obj, $doc_dir);
553
554 # look up 'gsdlmetafile' metadata and store that information
555 # explicitly in $doc_obj
556 $self->process_metafiles_metadata ($doc_obj);
557 }
558
559 # save this document
560 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
561 print GROUPPROCESS $section_text;
562
563 $self->{'gs_count'}++;
564}
565
566
567sub saveas {
568 my $self = shift (@_);
569
570 die "Basplug::saveas function must be implemented in sub classes\n";
571}
572
573sub get_doc_dir {
574 my $self = shift (@_);
575 my ($OID, $source_filename) = @_;
576
577 my $working_dir = $self->get_output_dir();
578 my $working_info = $self->{'output_info'};
579 return if (!defined $working_info);
580
581 my $doc_info = $working_info->get_info($OID);
582 my $doc_dir = '';
583
584 if (defined $doc_info && scalar(@$doc_info) >= 1)
585 {
586 # This OID already has an archives directory, so use it again
587 $doc_dir = $doc_info->[0];
588 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
589 }
590 elsif ($self->{'keep_import_structure'})
591 {
592 $source_filename = &File::Basename::dirname($source_filename);
593 $source_filename =~ s/[\\\/]+/\//g;
594 $source_filename =~ s/\/$//;
595
596 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
597 }
598
599 # We have to use a new archives directory for this document
600 if ($doc_dir eq "")
601 {
602 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
603 }
604
605 if (!defined $self->{'group'} || !$self->{'group'}){
606 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
607 }
608
609 return $doc_dir;
610}
611
612# we now create the directory as part of this call, to try and avoid race
613# conditions caused by parallel processing [jmt12]
614sub get_new_doc_dir{
615 my $self = shift (@_);
616 my($working_info,$working_dir,$OID) = @_;
617
618
619 my $doc_dir = "";
620 my $doc_dir_rest = $OID;
621
622 # remove any \ and / from the OID
623 $doc_dir_rest =~ s/[\\\/]//g;
624
625 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
626 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
627
628 my $doc_dir_num = 0;
629 my $created_directory = 0;
630 do {
631 $doc_dir .= "/" if $doc_dir_num > 0;
632 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
633 if ($self->{'subdir_hash_prefix'})
634 {
635 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
636 }
637 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
638 if ($doc_dir_rest =~ s/$pattern//i)
639 {
640 $doc_dir .= $1;
641 $doc_dir_num++;
642 }
643 my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
644 $created_directory = &FileUtils::makeAllDirectories($new_dir);
645 #rint STDERR "[DEBUG] create directory: $new_dir => $created_directory\n";
646 #rint STDERR "[DEBUG] rest: $doc_dir_rest\n";
647 #rint STDERR "[DEBUG] working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
648 #rint STDERR "[DEBUG] doc_dir_num: " . $doc_dir_num . "\n";
649 } while ($doc_dir_rest ne "" && $created_directory == 0 && $doc_dir_num < 2);
650 # what is working_info, because it's values seems to start at 1?
651 # ($working_info->size() >= 1024)
652 my $i = 1;
653 my $doc_dir_base = $doc_dir;
654 while ($created_directory == 0 && $i < 256) {
655 $doc_dir = "$doc_dir_base-$i";
656 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
657 $i++;
658 }
659
660 if (!$created_directory)
661 {
662 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
663 }
664
665 return "$doc_dir.dir";
666}
667
668sub process_assoc_files {
669 my $self = shift (@_);
670 my ($doc_obj, $doc_dir, $handle) = @_;
671
672 my $outhandle = $self->{'output_handle'};
673
674 my $output_dir = $self->get_output_dir();
675 return if (!defined $output_dir);
676
677 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
678
679 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
680 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
681
682 my @assoc_files = ();
683 my $filename;;
684
685 my $source_filename = $doc_obj->get_source_filename();
686
687 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
688
689 if (defined $collect_dir) {
690 my $dirsep_regexp = &util::get_os_dirsep();
691
692 if ($collect_dir !~ /$dirsep_regexp$/) {
693 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
694 }
695
696 # This test is never going to fail on Windows -- is this a problem?
697
698 if ($source_filename !~ /^$dirsep_regexp/) {
699 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
700 }
701 }
702
703
704 # set the assocfile path (even if we have no assoc files - need this for lucene)
705 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
706 "assocfilepath",
707 "$doc_dir");
708 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
709 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
710 $dir = "" unless defined $dir;
711
712 my $utf8_real_filename = $assoc_file_rec->[0];
713
714 # for some reasons the image associate file has / before the full path
715 $utf8_real_filename =~ s/^\\(.*)/$1/i;
716
717## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
718 my $real_filename = $utf8_real_filename;
719 $real_filename = &util::downgrade_if_dos_filename($real_filename);
720
721 if (&FileUtils::fileExists($real_filename)) {
722
723 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
724
725 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
726
727 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
728 "gsdlassocfile",
729 "$afile:$assoc_file_rec->[2]:$dir");
730 } elsif ($self->{'verbosity'} > 1) {
731 print $outhandle "BasePlugout::process couldn't copy the associated file " .
732 "$real_filename to $afile\n";
733 }
734 }
735}
736
737
738sub process_metafiles_metadata
739{
740 my $self = shift (@_);
741 my ($doc_obj) = @_;
742
743 my $top_section = $doc_obj->get_top_section();
744 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
745
746 foreach my $metafile_pair (@$metafiles) {
747 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
748
749 $doc_obj->metadata_file($full_metafile,$metafile);
750 }
751
752 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
753}
754
755sub archiveinf_files_to_field
756{
757 my $self = shift(@_);
758 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
759
760 foreach my $file_rec (@$files) {
761 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
762 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
763 # for some reasons the image associate file has / before the full path
764 $real_filename =~ s/^\\(.*)/$1/i;
765
766 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
767
768 if (&FileUtils::fileExists($raw_filename)) {
769
770# if (defined $collect_dir) {
771# my $collect_dir_re_safe = $collect_dir;
772# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
773# $collect_dir_re_safe =~ s/\./\\./g;##
774
775# $real_filename =~ s/^$collect_dir_re_safe//;
776# }
777
778 if (defined $reverse_lookups) {
779 $reverse_lookups->{$real_filename} = 1;
780 }
781### push(@{$oid_files->{$field}},$full_file);
782 push(@{$oid_files->{$field}},$raw_filename);
783 }
784 else {
785 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
786 }
787 }
788}
789
790sub archiveinf_db
791{
792 my $self = shift (@_);
793 my ($doc_obj) = @_;
794
795 my $verbosity = $self->{'verbosity'};
796
797 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
798 if (defined $collect_dir) {
799 my $dirsep_regexp = &util::get_os_dirsep();
800
801 if ($collect_dir !~ /$dirsep_regexp$/) {
802 # ensure there is a slash at the end
803 $collect_dir .= &util::get_dirsep();
804 }
805 }
806
807 my $oid = $doc_obj->get_OID();
808 my $source_filename = $doc_obj->get_unmodified_source_filename();
809 my $working_info = $self->{'output_info'};
810 my $doc_info = $working_info->get_info($oid);
811
812 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
813 # doc_file is the path to the archive doc.xml. Make sure it has unix
814 # slashes, then if the collection is copied to linux, it can be built without reimport
815 $doc_file =~ s/\\/\//g;
816 my $oid_files = { 'doc-file' => $doc_file,
817 'index-status' => $index_status,
818 'src-file' => $source_filename,
819 'sort-meta' => $sortmeta,
820 'assoc-file' => [],
821 'meta-file' => [] };
822
823 my $reverse_lookups = { $source_filename => "1" };
824
825
826 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
827 $collect_dir,$oid_files,$reverse_lookups);
828
829
830 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
831 $collect_dir,$oid_files);
832
833 # Get the infodbtype value for this collection from the arcinfo object
834 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
835 my $output_dir = $self->{'output_dir'};
836
837 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
838 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
839
840 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
841
842 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
843 my $top_section = $doc_obj->get_top_section();
844
845 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
846 my $dc_title = join("; ", @$dc_titles);
847
848 if ($oid_files->{'index-status'} eq "R") {
849 $dc_title .= " (Updated)";
850 }
851
852 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
853 my $rss_fh;
854 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh)) {
855 print $rss_fh "<item>\n";
856 print $rss_fh " <title>$dc_title</title>\n";
857 print $rss_fh " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
858 print $rss_fh "</item>\n";
859 &FileUtils::closeFileHandle($rss_filename, $rss_fh);
860 }
861 else {
862 print STDERR "**** Failed to open $rss_filename\n!$\n";
863 }
864
865
866 }
867
868 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
869 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
870 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
871 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
872
873 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
874 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
875 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
876
877 foreach my $rl (keys %$reverse_lookups) {
878 $working_info->add_reverseinfo($rl,$oid);
879 }
880
881 # meta files not set in reverese entry, but need to set the metadata flag
882 if (defined $doc_obj->get_meta_files()) {
883 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
884 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
885 $working_info->set_meta_file_flag($full_file);
886 }
887 }
888}
889
890
891sub set_sortmeta {
892 my $self = shift (@_);
893 my ($sortmeta, $removeprefix, $removesuffix) = @_;
894
895 $self->{'sortmeta'} = $sortmeta;
896 if (defined ($removeprefix) && $removeprefix ) {
897 $removeprefix =~ s/^\^//; # don't need a leading ^
898 $self->{'removeprefix'} = $removeprefix;
899 }
900 if (defined ($removesuffix) && $removesuffix) {
901 $removesuffix =~ s/\$$//; # don't need a trailing $
902 $self->{'removesuffix'} = $removesuffix;
903 }
904}
905
906sub open_xslt_pipe
907{
908 my $self = shift @_;
909 my ($output_file_name, $xslt_file)=@_;
910
911 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
912
913 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
914
915 my $mapping_file_path = "";
916
917 if ($ENV{'GSDLOS'} eq "windows"){
918 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
919 # this file:/// bit didn't work for me on windows XP
920 #$xslt_file = "\"file:///".$xslt_file."\"";
921 #$mapping_file_path = "\"file:///";
922 }
923 else{
924 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
925 }
926
927
928 $java_class_path = "\"".$java_class_path."\"";
929
930 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
931
932 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
933 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
934 $cmd .= "-m $mapping_file_path";
935 }
936
937 open(*XMLWRITER, $cmd)
938 or die "can't open pipe to xslt: $!";
939
940
941 $self->{'xslt_writer'} = *XMLWRITER;
942
943 print XMLWRITER "<?DocStart?>\n";
944 print XMLWRITER "$output_file_name\n";
945
946
947 }
948
949
950sub close_xslt_pipe
951{
952 my $self = shift @_;
953
954
955 return unless defined $self->{'xslt_writer'} ;
956
957 my $xsltwriter = $self->{'xslt_writer'};
958
959 print $xsltwriter "<?DocEnd?>\n";
960 close($xsltwriter);
961
962 undef $self->{'xslt_writer'};
963
964}
965
966sub close_file_output
967{
968 my ($self) = @_;
969
970 # make sure that the handle has been opened - it won't be if we failed
971 # to import any documents...
972 if (defined(fileno(GROUPPROCESS))) {
973 $self->output_xml_footer('GROUPPROCESS','Archive');
974 close GROUPPROCESS;
975 }
976
977 my $OID = $self->{'gs_OID'};
978 my $short_doc_file = $self->{'short_doc_file'};
979
980 if ($self->{'gzip'}) {
981 my $doc_file = $self->{'gs_filename'};
982 `gzip $doc_file`;
983 $doc_file .= ".gz";
984 $short_doc_file .= ".gz";
985 if (!&FileUtils::fileExists($doc_file)) {
986 my $outhandle = $self->{'output_handle'};
987 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
988 return 0;
989 }
990 }
991
992 # store reference in output_info
993 my $output_info = $self->{'output_info'};
994 return 0 if (!defined $output_info);
995 $output_info->add_info($OID, $short_doc_file, undef, undef);
996 return 1;
997}
998
999
1000#the subclass should implement this method if is_group method could return 1.
1001sub close_group_output{
1002 my $self = shift (@_);
1003}
1004
1005sub is_group {
1006 my $self = shift (@_);
1007 return 0;
1008}
1009
1010my $dc_set = { Title => 1,
1011 Creator => 1,
1012 Subject => 1,
1013 Description => 1,
1014 Publisher => 1,
1015 Contributor => 1,
1016 Date => 1,
1017 Type => 1,
1018 Format => 1,
1019 Identifier => 1,
1020 Source => 1,
1021 Language => 1,
1022 Relation => 1,
1023 Coverage => 1,
1024 Rights => 1};
1025
1026
1027# returns an XML representation of the dublin core metadata
1028# if dc meta is not found, try ex meta
1029# This method is not used by the DSpacePlugout, which has its
1030# own method to save its dc metadata
1031sub get_dc_metadata {
1032 my $self = shift(@_);
1033 my ($doc_obj, $section, $version) = @_;
1034
1035 # build up string of dublin core metadata
1036 $section="" unless defined $section;
1037
1038 my $section_ptr = $doc_obj->_lookup_section($section);
1039 return "" unless defined $section_ptr;
1040
1041
1042 my $explicit_dc = {};
1043 my $explicit_ex_dc = {};
1044 my $explicit_ex = {};
1045
1046 my $all_text="";
1047
1048 # We want high quality dc metadata to go in first, so we store all the
1049 # assigned dc.* values first. Then, for all those dc metadata names in
1050 # the official dc set that are as yet unassigned, we look to see whether
1051 # embedded ex.dc.* metadata has defined some values for them. If not,
1052 # then for the same missing dc metadata names, we look in ex metadata.
1053
1054 foreach my $data (@{$section_ptr->{'metadata'}}){
1055 my $escaped_value = &docprint::escape_text($data->[1]);
1056 if ($data->[0]=~ m/^dc\./) {
1057 $data->[0] =~ tr/[A-Z]/[a-z]/;
1058
1059 $data->[0] =~ m/^dc\.(.*)/;
1060 my $dc_element = $1;
1061
1062 if (!defined $explicit_dc->{$dc_element}) {
1063 $explicit_dc->{$dc_element} = [];
1064 }
1065 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1066
1067 if (defined $version && ($version eq "oai_dc")) {
1068 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1069 }
1070 else {
1071 # qualifier???
1072 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1073 }
1074
1075 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1076 $data->[0] =~ m/^ex\.dc\.(.*)/;
1077 my $ex_dc_element = $1;
1078 my $lc_ex_dc_element = lc($ex_dc_element);
1079
1080 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1081 if (defined $dc_set->{$ex_dc_element}) {
1082 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1083 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1084 }
1085 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1086 }
1087 }
1088 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1089 $data->[0] =~ m/^(ex\.)?(.*)/;
1090 my $ex_element = $2;
1091 my $lc_ex_element = lc($ex_element);
1092
1093 if (defined $dc_set->{$ex_element}) {
1094 if (!defined $explicit_ex->{$lc_ex_element}) {
1095 $explicit_ex->{$lc_ex_element} = [];
1096 }
1097 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1098 }
1099 }
1100 }
1101
1102 # go through dc_set and for any element *not* defined in explicit_dc
1103 # that does exist in explicit_ex, add it in as metadata
1104 foreach my $k ( keys %$dc_set ) {
1105 my $lc_k = lc($k);
1106
1107 if (!defined $explicit_dc->{$lc_k}) {
1108 # try to find if ex.dc.* defines this dc.* meta,
1109 # if not, then look for whether there's an ex.* equivalent
1110
1111 if (defined $explicit_ex_dc->{$lc_k}) {
1112 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1113 my $dc_element = $lc_k;
1114 my $escaped_value = $v;
1115
1116 if (defined $version && ($version eq "oai_dc")) {
1117 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1118 }
1119 else {
1120 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1121 }
1122 }
1123 } elsif (defined $explicit_ex->{$lc_k}) {
1124 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1125 my $dc_element = $lc_k;
1126 my $escaped_value = $v;
1127
1128 if (defined $version && ($version eq "oai_dc")) {
1129 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1130 }
1131 else {
1132 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1133 }
1134 }
1135 }
1136 }
1137 }
1138
1139 if ($all_text eq "") {
1140 $all_text .= " There is no Dublin Core metatdata in this document\n";
1141 }
1142 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1143
1144 return $all_text;
1145}
1146
1147# Build up dublin_core metadata. Priority given to dc.* over ex.*
1148# This method was apparently added by Jeffrey and committed by Shaoqun.
1149# But we don't know why it was added, so not using it anymore.
1150sub new_get_dc_metadata {
1151
1152 my $self = shift(@_);
1153 my ($doc_obj, $section, $version) = @_;
1154
1155 # build up string of dublin core metadata
1156 $section="" unless defined $section;
1157
1158 my $section_ptr=$doc_obj->_lookup_section($section);
1159 return "" unless defined $section_ptr;
1160
1161 my $all_text = "";
1162 foreach my $data (@{$section_ptr->{'metadata'}}){
1163 my $escaped_value = &docprint::escape_text($data->[1]);
1164 my $dc_element = $data->[0];
1165
1166 my @array = split('\.',$dc_element);
1167 my ($type,$name);
1168
1169 if(defined $array[1])
1170 {
1171 $type = $array[0];
1172 $name = $array[1];
1173 }
1174 else
1175 {
1176 $type = "ex";
1177 $name = $array[0];
1178 }
1179
1180 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1181 }
1182 return $all_text;
1183}
1184
1185
11861;
Note: See TracBrowser for help on using the repository browser.