source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 28642

Last change on this file since 28642 was 28642, checked in by kjdon, 10 years ago

group processing code was GreenstoneXML format so moved it into GreenstoneXMLPlugout. tidying up the code. reordered options. group processing now writes out the correct archivesinf databases.

  • Property svn:keywords set to Author Date Id Revision
File size: 34.7 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "xslt_file",
47 'desc' => "{BasPlugout.xslt_file}",
48 'type' => "string",
49 'reqd' => "no",
50 'deft' => "",
51 'hiddengli' => "no"},
52 { 'name' => "subdir_split_length",
53 'desc' => "{BasPlugout.subdir_split_length}",
54 'type' => "int",
55 'reqd' => "no",
56 'deft' => "8",
57 'hiddengli' => "no"},
58 { 'name' => "subdir_hash_prefix",
59 'desc' => "{BasPlugout.subdir_hash_prefix}",
60 'type' => "flag",
61 'reqd' => "no",
62 'deft' => "0",
63 'hiddengli' => "no"},
64 { 'name' => "gzip_output",
65 'desc' => "{BasPlugout.gzip_output}",
66 'type' => "flag",
67 'reqd' => "no",
68 'hiddengli' => "no"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "output_info",
76 'desc' => "{BasPlugout.output_info}",
77 'type' => "string",
78 'reqd' => "yes",
79 'hiddengli' => "yes"},
80 { 'name' => "output_handle",
81 'desc' => "{BasPlugout.output_handle}",
82 'type' => "string",
83 'deft' => 'STDERR',
84 'reqd' => "no",
85 'hiddengli' => "yes"},
86 { 'name' => "debug",
87 'desc' => "{BasPlugout.debug}",
88 'type' => "flag",
89 'reqd' => "no",
90 'hiddengli' => "yes"},
91 { 'name' => 'no_rss',
92 'desc' => '{BasPlugout.no_rss}',
93 'type' => 'flag',
94 'reqd' => 'no',
95 'hiddengli' => 'yes'},
96];
97
98my $options = { 'name' => "BasePlugout",
99 'desc' => "{BasPlugout.desc}",
100 'abstract' => "yes",
101 'inherits' => "no",
102 'args' => $arguments};
103
104sub new
105{
106 my $class = shift (@_);
107
108 my ($plugoutlist,$args,$hashArgOptLists) = @_;
109 push(@$plugoutlist, $class);
110
111 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
112
113 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
114 push(@{$hashArgOptLists->{"OptList"}},$options);
115
116 my $self = {};
117 $self->{'plugout_type'} = $class;
118 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
119 $self->{"info_only"} = 0;
120
121 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
122 # the args, just return the object.
123 foreach my $strArg (@{$args})
124 {
125 if(defined $strArg && $strArg eq "-gsdlinfo")
126 {
127 $self->{"info_only"} = 1;
128 return bless $self, $class;
129 }
130 }
131
132 delete $self->{"info_only"};
133
134 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
135 {
136 my $classTempClass = bless $self, $class;
137 print STDERR "<BadPlugout d=$plugout_name>\n";
138 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
139 $classTempClass->print_txt_usage(""); # Use default resource bundle
140 die "\n";
141 }
142
143
144 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
145 {
146 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
147 if (!defined $full_file_path) {
148 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
149 die "\n";
150 }
151 $self->{'xslt_file'} = $full_file_path;
152 }
153
154 # for group processing
155 $self->{'gs_count'} = 0;
156 $self->{'group_position'} = 1;
157
158 $self->{'keep_import_structure'} = 0;
159
160 return bless $self, $class;
161
162}
163
164sub print_xml_usage
165{
166 my $self = shift(@_);
167 my $header = shift(@_);
168 my $high_level_information_only = shift(@_);
169
170 # XML output is always in UTF-8
171 gsprintf::output_strings_in_UTF8;
172
173 if ($header) {
174 &PrintUsage::print_xml_header("plugout");
175 }
176 $self->print_xml($high_level_information_only);
177}
178
179
180sub print_xml
181{
182 my $self = shift(@_);
183 my $high_level_information_only = shift(@_);
184
185 my $optionlistref = $self->{'option_list'};
186 my @optionlist = @$optionlistref;
187 my $plugoutoptions = shift(@$optionlistref);
188 return if (!defined($plugoutoptions));
189
190 gsprintf(STDERR, "<PlugoutInfo>\n");
191 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
192 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
193 $desc =~ s/</&amp;lt;/g; # doubly escaped
194 $desc =~ s/>/&amp;gt;/g;
195 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
196 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
197 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
198 unless (defined($high_level_information_only)) {
199 gsprintf(STDERR, " <Arguments>\n");
200 if (defined($plugoutoptions->{'args'})) {
201 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
202 }
203 gsprintf(STDERR, " </Arguments>\n");
204
205 # Recurse up the plugout hierarchy
206 $self->print_xml();
207 }
208 gsprintf(STDERR, "</PlugoutInfo>\n");
209}
210
211
212sub print_txt_usage
213{
214 my $self = shift(@_);
215
216 # Print the usage message for a plugout (recursively)
217 my $descoffset = $self->determine_description_offset(0);
218 $self->print_plugout_usage($descoffset, 1);
219}
220
221sub determine_description_offset
222{
223 my $self = shift(@_);
224 my $maxoffset = shift(@_);
225
226 my $optionlistref = $self->{'option_list'};
227 my @optionlist = @$optionlistref;
228 my $plugoutoptions = pop(@$optionlistref);
229 return $maxoffset if (!defined($plugoutoptions));
230
231 # Find the length of the longest option string of this download
232 my $plugoutargs = $plugoutoptions->{'args'};
233 if (defined($plugoutargs)) {
234 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
235 if ($longest > $maxoffset) {
236 $maxoffset = $longest;
237 }
238 }
239
240 # Recurse up the download hierarchy
241 $maxoffset = $self->determine_description_offset($maxoffset);
242 $self->{'option_list'} = \@optionlist;
243 return $maxoffset;
244}
245
246
247sub print_plugout_usage
248{
249 my $self = shift(@_);
250 my $descoffset = shift(@_);
251 my $isleafclass = shift(@_);
252
253 my $optionlistref = $self->{'option_list'};
254 my @optionlist = @$optionlistref;
255 my $plugoutoptions = shift(@$optionlistref);
256 return if (!defined($plugoutoptions));
257
258 my $plugoutname = $plugoutoptions->{'name'};
259 my $plugoutargs = $plugoutoptions->{'args'};
260 my $plugoutdesc = $plugoutoptions->{'desc'};
261
262 # Produce the usage information using the data structure above
263 if ($isleafclass) {
264 if (defined($plugoutdesc)) {
265 gsprintf(STDERR, "$plugoutdesc\n\n");
266 }
267 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
268 }
269
270 # Display the download options, if there are some
271 if (defined($plugoutargs)) {
272 # Calculate the column offset of the option descriptions
273 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
274
275 if ($isleafclass) {
276 gsprintf(STDERR, " {common.specific_options}:\n");
277 }
278 else {
279 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
280 }
281
282 # Display the download options
283 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
284 }
285
286 # Recurse up the download hierarchy
287 $self->print_plugout_usage($descoffset, 0);
288 $self->{'option_list'} = \@optionlist;
289}
290
291
292sub error
293{
294 my ($strFunctionName,$strError) = @_;
295 {
296 print "Error occoured in BasePlugout.pm\n".
297 "In Function: ".$strFunctionName."\n".
298 "Error Message: ".$strError."\n";
299 exit(-1);
300 }
301}
302
303# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
304sub set_OIDtype {
305 my $self = shift (@_);
306 my ($type, $metadata) = @_;
307
308 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
309 $self->{'OIDtype'} = $type;
310 } else {
311 $self->{'OIDtype'} = "hash";
312 }
313 if ($type =~ /^assigned$/) {
314 if (defined $metadata) {
315 $self->{'OIDmetadata'} = $metadata;
316 } else {
317 $self->{'OIDmetadata'} = "dc.Identifier";
318 }
319 }
320}
321
322sub set_output_dir
323{
324 my $self = shift @_;
325 my ($output_dir) = @_;
326
327 $self->{'output_dir'} = $output_dir;
328}
329
330sub setoutputdir
331{
332 my $self = shift @_;
333 my ($output_dir) = @_;
334
335 $self->{'output_dir'} = $output_dir;
336}
337
338sub get_output_dir
339{
340 my $self = shift (@_);
341
342 return $self->{'output_dir'};
343}
344
345sub getoutputdir
346{
347 my $self = shift (@_);
348
349 return $self->{'output_dir'};
350}
351
352sub getoutputinfo
353{
354 my $self = shift (@_);
355
356 return $self->{'output_info'};
357}
358
359
360sub get_output_handler
361{
362 my $self = shift (@_);
363
364 my ($output_file_name) = @_;
365
366 my $fh;
367 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
368
369 return $fh;
370}
371
372sub release_output_handler
373{
374 my $self = shift (@_);
375 my ($outhandler) = @_;
376
377 close($outhandler);
378
379}
380
381sub output_xml_header {
382 my $self = shift (@_);
383 my ($handle,$docroot,$nondoctype) = @_;
384
385
386 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
387
388 #For Dspace must be UTF in lower case
389 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
390
391 if (!defined $nondoctype){
392 my $doctype = (defined $docroot) ? $docroot : "Section";
393
394 # Used to be '<!DOCTYPE Archive SYSTEM ...'
395
396 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
397 }
398
399 print $handle "<$docroot>\n" if defined $docroot;
400}
401
402sub output_xml_footer {
403 my $self = shift (@_);
404 my ($handle,$docroot) = @_;
405 print $handle "</$docroot>\n" if defined $docroot;
406}
407
408
409sub output_general_xml_header
410{
411 my $self = shift (@_);
412 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
413
414 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
415
416 if (defined $opt_dtd) {
417 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
418 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
419 }
420
421 if (defined $docroot) {
422 my $full_docroot = $docroot;
423 if (defined $opt_attributes) {
424 $full_docroot .= " $opt_attributes";
425 }
426
427 print $handle "<$full_docroot>\n"
428 }
429}
430
431sub output_general_xml_footer
432{
433 output_xml_footer(@_);
434}
435
436# This is called by the plugins after read_into_doc_obj generates the doc_obj.
437sub process {
438 my $self = shift (@_);
439 my ($doc_obj) = @_;
440
441 my $output_info = $self->{'output_info'};
442 return if (!defined $output_info);
443
444 # for OAI purposes
445 $doc_obj->set_lastmodified();
446 $doc_obj->set_oailastmodified();
447
448 # find out which directory to save to
449 my $doc_dir = "";
450 if ($self->is_group()) {
451 $doc_dir = $self->get_group_doc_dir($doc_obj);
452 } else {
453 $doc_dir = $self->get_doc_dir($doc_obj);
454 }
455
456 ##############################
457 # call subclass' saveas method
458 ##############################
459 $self->saveas($doc_obj,$doc_dir);
460
461 # write out data to archiveinf-doc.db
462 $self->archiveinf_db($doc_obj);
463
464 if ($self->is_group()) {
465 $self->{'gs_count'}++; # do we want this for all cases?
466 $self->{'group_position'}++;
467 }
468}
469
470sub store_output_info_reference {
471 my $self = shift (@_);
472 my ($doc_obj) = @_;
473
474 my $output_info = $self->{'output_info'};
475 my $metaname = $self->{'sortmeta'};
476
477 my $group_position;
478 if ($self->is_group()) {
479 $group_position = $self->{'group_position'};
480 }
481 if (!defined $metaname || $metaname !~ /\S/) {
482 my $OID = $doc_obj->get_OID();
483 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
484 return;
485 }
486
487 if ($metaname eq "OID") { # sort by OID
488 my $OID = $doc_obj->get_OID();
489 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
490 return;
491 }
492
493 my $metadata = "";
494 my $top_section = $doc_obj->get_top_section();
495
496 my @commameta_list = split(/,/, $metaname);
497 foreach my $cmn (@commameta_list) {
498 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
499 if ($meta) {
500 # do remove prefix/suffix - this will apply to all values
501 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
502 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
503 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
504 $metadata .= $meta if ($meta);
505 }
506 }
507
508 # store reference in the output_info
509 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
510
511}
512
513
514
515sub saveas {
516 my $self = shift (@_);
517 my ($doc_obj, $doc_dir) = @_;
518
519 die "Basplug::saveas function must be implemented in sub classes\n";
520}
521
522sub get_group_doc_dir {
523 my $self = shift (@_);
524 my ($doc_obj) = @_;
525
526 my $outhandle = $self->{'output_handle'};
527 my $OID = $doc_obj->get_OID();
528 $OID = "NULL" unless defined $OID;
529
530 my $groupsize = $self->{'group_size'};
531 my $gs_count = $self->{'gs_count'};
532 my $open_new_file = (($gs_count % $groupsize)==0);
533
534 my $doc_dir;
535
536 if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
537 # if we have some assoc files, then we will need to start a new file
538 if ($self->{'verbosity'} > 2) {
539 print $outhandle " Starting a archives folder for $OID as it has associated files\n";
540 }
541 $open_new_file = 1;
542 }
543
544 # opening a new file
545 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) {
546 # first we close off the old output
547 if ($gs_count>0)
548 {
549 return if (!$self->close_group_output());
550 }
551
552 # this will create the directory
553 $doc_dir = $self->get_doc_dir ($doc_obj);
554 $self->{'new_doc_dir'} = 1;
555 $self->{'gs_doc_dir'} = $doc_dir;
556 $self->{'group_position'} = 1;
557 }
558 else {
559 $doc_dir = $self->{'gs_doc_dir'};
560 $self->{'new_doc_dir'} = 0;
561 }
562 return $doc_dir;
563
564}
565sub get_doc_dir {
566
567 my $self = shift (@_);
568 my ($doc_obj) = @_;
569
570 my $OID = $doc_obj->get_OID();
571 $OID = "NULL" unless defined $OID;
572
573 my $working_dir = $self->get_output_dir();
574 my $working_info = $self->{'output_info'};
575 return if (!defined $working_info);
576
577 my $doc_info = $working_info->get_info($OID);
578 my $doc_dir = '';
579
580 if (defined $doc_info && scalar(@$doc_info) >= 1)
581 {
582 # This OID already has an archives directory, so use it again
583 $doc_dir = $doc_info->[0];
584 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
585 }
586 elsif ($self->{'keep_import_structure'})
587 {
588 my $source_filename = $doc_obj->get_source_filename();
589 $source_filename = &File::Basename::dirname($source_filename);
590 $source_filename =~ s/[\\\/]+/\//g;
591 $source_filename =~ s/\/$//;
592
593 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
594 }
595
596 # We have to use a new archives directory for this document
597 if ($doc_dir eq "")
598 {
599 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
600 }
601
602 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
603
604 return $doc_dir;
605}
606
607
608## @function get_new_doc_dir()
609#
610# Once a doc object is ready to write to disk (and hence has a nice OID),
611# generate a unique subdirectory to write the information to.
612#
613# - create the directory as part of this call, to try and avoid race conditions
614# found in parallel processing [jmt12]
615#
616# @todo figure out what the rule regarding $work_info->size() is meant to do
617#
618# @todo determine what $self->{'group'} is, and whether it should affect
619# directory creation
620#
621sub get_new_doc_dir
622{
623 my $self = shift (@_);
624 my($working_info,$working_dir,$OID) = @_;
625
626 my $doc_dir = "";
627 my $doc_dir_rest = $OID;
628
629 # remove any \ and / from the OID
630 $doc_dir_rest =~ s/[\\\/]//g;
631
632 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
633 if ($ENV{'GSDLOS'} =~ /^windows$/i)
634 {
635 $doc_dir_rest =~ s/\://g;
636 }
637
638 # we generally create a unique directory by adding consequtive fragments of
639 # the document identifier (split by some predefined length - defaulting to
640 # 8) until we find a directory that doesn't yet exist. Note that directories
641 # that contain a document have a suffix ".dir" (whereas those that contain
642 # only subdirectories have no suffix).
643 my $doc_dir_num = 0; # how many directories deep we are
644 my $created_directory = 0; # have we successfully created a new directory
645 do
646 {
647 # (does this work on windows? - jmt12)
648 if ($doc_dir_num > 0)
649 {
650 $doc_dir .= '/';
651 }
652
653 # the default matching pattern grabs the next 'subdir_split_length'
654 # characters of the OID to act as the next subdirectory
655 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
656
657 # Do we count any "HASH" prefix against the split length limit?
658 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
659 {
660 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
661 }
662
663 # Note the use of 's' to both capture the next chuck of OID and to remove
664 # it from OID at the same time
665 if ($doc_dir_rest =~ s/$pattern//i)
666 {
667 $doc_dir .= $1;
668 $doc_dir_num++;
669
670 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
671 if(!FileUtils::directoryExists($full_doc_dir))
672 {
673 &FileUtils::makeAllDirectories($full_doc_dir);
674 $created_directory = 1;
675 }
676
677 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
678 ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
679 ###rint STDERR " - rest: $doc_dir_rest\n";
680 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
681 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
682 }
683 }
684 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
685
686 # not unique yet? Add on an incremental suffix until we are unique
687 my $i = 1;
688 my $doc_dir_base = $doc_dir;
689 while ($created_directory == 0)
690 {
691 $doc_dir = $doc_dir_base . '-' . $i;
692 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
693 $i++;
694 }
695
696 # in theory this should never happen
697 if (!$created_directory)
698 {
699 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
700 }
701
702 return $doc_dir . '.dir';
703}
704## get_new_doc_dir()
705
706
707sub process_assoc_files {
708 my $self = shift (@_);
709 my ($doc_obj, $doc_dir, $handle) = @_;
710
711 my $outhandle = $self->{'output_handle'};
712
713 my $output_dir = $self->get_output_dir();
714 return if (!defined $output_dir);
715
716 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
717
718 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
719 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
720
721 my @assoc_files = ();
722 my $filename;;
723
724 my $source_filename = $doc_obj->get_source_filename();
725
726 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
727
728 if (defined $collect_dir) {
729 my $dirsep_regexp = &util::get_os_dirsep();
730
731 if ($collect_dir !~ /$dirsep_regexp$/) {
732 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
733 }
734
735 # This test is never going to fail on Windows -- is this a problem?
736
737 if ($source_filename !~ /^$dirsep_regexp/) {
738 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
739 }
740 }
741
742
743 # set the assocfile path (even if we have no assoc files - need this for lucene)
744 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
745 "assocfilepath",
746 "$doc_dir");
747 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
748 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
749 $dir = "" unless defined $dir;
750
751 my $utf8_real_filename = $assoc_file_rec->[0];
752
753 # for some reasons the image associate file has / before the full path
754 $utf8_real_filename =~ s/^\\(.*)/$1/i;
755
756## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
757 my $real_filename = $utf8_real_filename;
758 $real_filename = &util::downgrade_if_dos_filename($real_filename);
759
760 if (&FileUtils::fileExists($real_filename)) {
761
762 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
763
764 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
765
766 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
767 "gsdlassocfile",
768 "$afile:$assoc_file_rec->[2]:$dir");
769 } elsif ($self->{'verbosity'} > 1) {
770 print $outhandle "BasePlugout::process couldn't copy the associated file " .
771 "$real_filename to $afile\n";
772 }
773 }
774}
775
776
777sub process_metafiles_metadata
778{
779 my $self = shift (@_);
780 my ($doc_obj) = @_;
781
782 my $top_section = $doc_obj->get_top_section();
783 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
784
785 foreach my $metafile_pair (@$metafiles) {
786 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
787
788 $doc_obj->metadata_file($full_metafile,$metafile);
789 }
790
791 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
792}
793
794sub archiveinf_files_to_field
795{
796 my $self = shift(@_);
797 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
798
799 foreach my $file_rec (@$files) {
800 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
801 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
802 # for some reasons the image associate file has / before the full path
803 $real_filename =~ s/^\\(.*)/$1/i;
804
805 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
806
807 if (&FileUtils::fileExists($raw_filename)) {
808
809# if (defined $collect_dir) {
810# my $collect_dir_re_safe = $collect_dir;
811# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
812# $collect_dir_re_safe =~ s/\./\\./g;##
813
814# $real_filename =~ s/^$collect_dir_re_safe//;
815# }
816
817 if (defined $reverse_lookups) {
818 $reverse_lookups->{$real_filename} = 1;
819 }
820
821 if($field =~ m@assoc-file|src-file|meta-file@) {
822 $raw_filename = &util::abspath_to_placeholders($raw_filename);
823 }
824
825### push(@{$oid_files->{$field}},$full_file);
826 push(@{$oid_files->{$field}},$raw_filename);
827 }
828 else {
829 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
830 }
831 }
832}
833
834sub archiveinf_db
835{
836 my $self = shift (@_);
837 my ($doc_obj) = @_;
838
839 my $verbosity = $self->{'verbosity'};
840
841 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
842 if (defined $collect_dir) {
843 my $dirsep_regexp = &util::get_os_dirsep();
844
845 if ($collect_dir !~ /$dirsep_regexp$/) {
846 # ensure there is a slash at the end
847 $collect_dir .= &util::get_dirsep();
848 }
849 }
850
851 my $oid = $doc_obj->get_OID();
852 my $source_filename = $doc_obj->get_unmodified_source_filename();
853 my $working_info = $self->{'output_info'};
854 my $doc_info = $working_info->get_info($oid);
855
856 my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
857 # doc_file is the path to the archive doc.xml. Make sure it has unix
858 # slashes, then if the collection is copied to linux, it can be built without reimport
859 $doc_file =~ s/\\/\//g;
860 my $oid_files = { 'doc-file' => $doc_file,
861 'index-status' => $index_status,
862 'src-file' => $source_filename,
863 'sort-meta' => $sortmeta,
864 'assoc-file' => [],
865 'meta-file' => [] };
866 if (defined $group_position) {
867 $oid_files->{'group-position'} = $group_position;
868 }
869 my $reverse_lookups = { $source_filename => "1" };
870
871
872 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
873 $collect_dir,$oid_files,$reverse_lookups);
874
875
876 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
877 $collect_dir,$oid_files);
878
879 # Get the infodbtype value for this collection from the arcinfo object
880 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
881 my $output_dir = $self->{'output_dir'};
882
883 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
884
885 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
886
887 if (!$self->{'no_rss'})
888 {
889 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
890 my $top_section = $doc_obj->get_top_section();
891
892 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
893 my $dc_title = join("; ", @$dc_titles);
894
895 if ($oid_files->{'index-status'} eq "R") {
896 $dc_title .= " (Updated)";
897 }
898
899 my $rss_entry = "<item>\n";
900 $rss_entry .= " <title>$dc_title</title>\n";
901 $rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
902 $rss_entry .= "</item>";
903
904 if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
905 {
906 my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
907 my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
908 &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
909 &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
910 }
911 else
912 {
913 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
914 my $rss_fh;
915 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh))
916 {
917 print $rss_fh $rss_entry . "\n";
918 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
919 }
920 else
921 {
922 print STDERR "**** Failed to open $rss_filename\n$!\n";
923 }
924 }
925 }
926 }
927
928 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
929 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
930 $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
931 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
932 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
933 if (defined $oid_files->{'group-position'}) {
934 $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
935 }
936
937 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
938 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
939 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
940
941 foreach my $rl (keys %$reverse_lookups) {
942 $working_info->add_reverseinfo($rl,$oid);
943 }
944
945 # meta files not set in reverese entry, but need to set the metadata flag
946 if (defined $doc_obj->get_meta_files()) {
947 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
948 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
949 $working_info->set_meta_file_flag($full_file);
950 }
951 }
952}
953
954
955sub set_sortmeta {
956 my $self = shift (@_);
957 my ($sortmeta, $removeprefix, $removesuffix) = @_;
958
959 $self->{'sortmeta'} = $sortmeta;
960 if (defined ($removeprefix) && $removeprefix ) {
961 $removeprefix =~ s/^\^//; # don't need a leading ^
962 $self->{'removeprefix'} = $removeprefix;
963 }
964 if (defined ($removesuffix) && $removesuffix) {
965 $removesuffix =~ s/\$$//; # don't need a trailing $
966 $self->{'removesuffix'} = $removesuffix;
967 }
968}
969
970sub open_xslt_pipe
971{
972 my $self = shift @_;
973 my ($output_file_name, $xslt_file)=@_;
974
975 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
976
977 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
978
979 my $mapping_file_path = "";
980
981 if ($ENV{'GSDLOS'} eq "windows"){
982 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
983 # this file:/// bit didn't work for me on windows XP
984 #$xslt_file = "\"file:///".$xslt_file."\"";
985 #$mapping_file_path = "\"file:///";
986 }
987 else{
988 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
989 }
990
991
992 $java_class_path = "\"".$java_class_path."\"";
993
994 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
995
996 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
997 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
998 $cmd .= "-m $mapping_file_path";
999 }
1000
1001 open(*XMLWRITER, $cmd)
1002 or die "can't open pipe to xslt: $!";
1003
1004
1005 $self->{'xslt_writer'} = *XMLWRITER;
1006
1007 print XMLWRITER "<?DocStart?>\n";
1008 print XMLWRITER "$output_file_name\n";
1009
1010
1011 }
1012
1013
1014sub close_xslt_pipe
1015{
1016 my $self = shift @_;
1017
1018
1019 return unless defined $self->{'xslt_writer'} ;
1020
1021 my $xsltwriter = $self->{'xslt_writer'};
1022
1023 print $xsltwriter "<?DocEnd?>\n";
1024 close($xsltwriter);
1025
1026 undef $self->{'xslt_writer'};
1027
1028}
1029
1030
1031
1032#the subclass should implement this method if is_group method could return 1.
1033sub close_group_output{
1034 my $self = shift (@_);
1035}
1036
1037sub is_group {
1038 my $self = shift (@_);
1039 return 0;
1040}
1041
1042my $dc_set = { Title => 1,
1043 Creator => 1,
1044 Subject => 1,
1045 Description => 1,
1046 Publisher => 1,
1047 Contributor => 1,
1048 Date => 1,
1049 Type => 1,
1050 Format => 1,
1051 Identifier => 1,
1052 Source => 1,
1053 Language => 1,
1054 Relation => 1,
1055 Coverage => 1,
1056 Rights => 1};
1057
1058
1059# returns an XML representation of the dublin core metadata
1060# if dc meta is not found, try ex meta
1061# This method is not used by the DSpacePlugout, which has its
1062# own method to save its dc metadata
1063sub get_dc_metadata {
1064 my $self = shift(@_);
1065 my ($doc_obj, $section, $version) = @_;
1066
1067 # build up string of dublin core metadata
1068 $section="" unless defined $section;
1069
1070 my $section_ptr = $doc_obj->_lookup_section($section);
1071 return "" unless defined $section_ptr;
1072
1073
1074 my $explicit_dc = {};
1075 my $explicit_ex_dc = {};
1076 my $explicit_ex = {};
1077
1078 my $all_text="";
1079
1080 # We want high quality dc metadata to go in first, so we store all the
1081 # assigned dc.* values first. Then, for all those dc metadata names in
1082 # the official dc set that are as yet unassigned, we look to see whether
1083 # embedded ex.dc.* metadata has defined some values for them. If not,
1084 # then for the same missing dc metadata names, we look in ex metadata.
1085
1086 foreach my $data (@{$section_ptr->{'metadata'}}){
1087 my $escaped_value = &docprint::escape_text($data->[1]);
1088 if ($data->[0]=~ m/^dc\./) {
1089 $data->[0] =~ tr/[A-Z]/[a-z]/;
1090
1091 $data->[0] =~ m/^dc\.(.*)/;
1092 my $dc_element = $1;
1093
1094 if (!defined $explicit_dc->{$dc_element}) {
1095 $explicit_dc->{$dc_element} = [];
1096 }
1097 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1098
1099 if (defined $version && ($version eq "oai_dc")) {
1100 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1101 }
1102 else {
1103 # qualifier???
1104 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1105 }
1106
1107 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1108 $data->[0] =~ m/^ex\.dc\.(.*)/;
1109 my $ex_dc_element = $1;
1110 my $lc_ex_dc_element = lc($ex_dc_element);
1111
1112 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1113 if (defined $dc_set->{$ex_dc_element}) {
1114 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1115 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1116 }
1117 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1118 }
1119 }
1120 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1121 $data->[0] =~ m/^(ex\.)?(.*)/;
1122 my $ex_element = $2;
1123 my $lc_ex_element = lc($ex_element);
1124
1125 if (defined $dc_set->{$ex_element}) {
1126 if (!defined $explicit_ex->{$lc_ex_element}) {
1127 $explicit_ex->{$lc_ex_element} = [];
1128 }
1129 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1130 }
1131 }
1132 }
1133
1134 # go through dc_set and for any element *not* defined in explicit_dc
1135 # that does exist in explicit_ex, add it in as metadata
1136 foreach my $k ( keys %$dc_set ) {
1137 my $lc_k = lc($k);
1138
1139 if (!defined $explicit_dc->{$lc_k}) {
1140 # try to find if ex.dc.* defines this dc.* meta,
1141 # if not, then look for whether there's an ex.* equivalent
1142
1143 if (defined $explicit_ex_dc->{$lc_k}) {
1144 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1145 my $dc_element = $lc_k;
1146 my $escaped_value = $v;
1147
1148 if (defined $version && ($version eq "oai_dc")) {
1149 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1150 }
1151 else {
1152 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1153 }
1154 }
1155 } elsif (defined $explicit_ex->{$lc_k}) {
1156 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1157 my $dc_element = $lc_k;
1158 my $escaped_value = $v;
1159
1160 if (defined $version && ($version eq "oai_dc")) {
1161 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1162 }
1163 else {
1164 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1165 }
1166 }
1167 }
1168 }
1169 }
1170
1171 if ($all_text eq "") {
1172 $all_text .= " There is no Dublin Core metatdata in this document\n";
1173 }
1174 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1175
1176 return $all_text;
1177}
1178
1179# Build up dublin_core metadata. Priority given to dc.* over ex.*
1180# This method was apparently added by Jeffrey and committed by Shaoqun.
1181# But we don't know why it was added, so not using it anymore.
1182sub new_get_dc_metadata {
1183
1184 my $self = shift(@_);
1185 my ($doc_obj, $section, $version) = @_;
1186
1187 # build up string of dublin core metadata
1188 $section="" unless defined $section;
1189
1190 my $section_ptr=$doc_obj->_lookup_section($section);
1191 return "" unless defined $section_ptr;
1192
1193 my $all_text = "";
1194 foreach my $data (@{$section_ptr->{'metadata'}}){
1195 my $escaped_value = &docprint::escape_text($data->[1]);
1196 my $dc_element = $data->[0];
1197
1198 my @array = split('\.',$dc_element);
1199 my ($type,$name);
1200
1201 if(defined $array[1])
1202 {
1203 $type = $array[0];
1204 $name = $array[1];
1205 }
1206 else
1207 {
1208 $type = "ex";
1209 $name = $array[0];
1210 }
1211
1212 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1213 }
1214 return $all_text;
1215}
1216
1217
12181;
Note: See TracBrowser for help on using the repository browser.