source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27500

Last change on this file since 27500 was 27500, checked in by jmt12, 11 years ago

Missed an old style file open that instead needs to go through FileUtils::openFileHandle()

  • Property svn:keywords set to Author Date Id Revision
File size: 32.9 KB
RevLine 
[12330]1###########################################################################
2#
[17202]3# BasePlugout.pm -- base class for all the plugout modules
[12330]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[17202]26package BasePlugout;
[12330]27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
[12459]32no strict 'refs';
[12330]33
[21565]34use dbutil;
[12330]35use gsprintf 'gsprintf';
36use printusage;
[12546]37use parse2;
[27306]38use util;
39use FileUtils;
[12330]40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
[13172]46 { 'name' => "group_size",
[12330]47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
[20320]61 'deft' => "",
[12330]62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
[13172]79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
[27499]84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "no",
96 'hiddengli' => "no"}
[12330]97];
98
[17202]99my $options = { 'name' => "BasePlugout",
[12330]100 'desc' => "{BasPlugout.desc}",
101 'abstract' => "yes",
102 'inherits' => "no",
103 'args' => $arguments};
104
105sub new
106{
107 my $class = shift (@_);
108
109 my ($plugoutlist,$args,$hashArgOptLists) = @_;
110 push(@$plugoutlist, $class);
111
[20320]112 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
[12330]113
[17202]114 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115 push(@{$hashArgOptLists->{"OptList"}},$options);
[12330]116
117 my $self = {};
118 $self->{'plugout_type'} = $class;
119 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120 $self->{"info_only"} = 0;
121
122 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123 # the args, just return the object.
124 foreach my $strArg (@{$args})
125 {
126 if(defined $strArg && $strArg eq "-gsdlinfo")
127 {
128 $self->{"info_only"} = 1;
129 return bless $self, $class;
130 }
131 }
132
133 delete $self->{"info_only"};
134
[12546]135 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[12330]136 {
137 my $classTempClass = bless $self, $class;
[20320]138 print STDERR "<BadPlugout d=$plugout_name>\n";
139 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
[12330]140 $classTempClass->print_txt_usage(""); # Use default resource bundle
141 die "\n";
142 }
143
144
[12603]145 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
[12330]146 {
[20320]147 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148 if (!defined $full_file_path) {
149 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150 die "\n";
151 }
152 $self->{'xslt_file'} = $full_file_path;
[12330]153 }
154
155 $self->{'gs_count'} = 0;
156
[12459]157 $self->{'keep_import_structure'} = 0;
[12330]158
159 return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165 my $self = shift(@_);
166 my $header = shift(@_);
[12628]167 my $high_level_information_only = shift(@_);
[12330]168
169 # XML output is always in UTF-8
170 gsprintf::output_strings_in_UTF8;
171
172 if ($header) {
173 &PrintUsage::print_xml_header("plugout");
174 }
[12628]175 $self->print_xml($high_level_information_only);
[12330]176}
177
178
179sub print_xml
180{
181 my $self = shift(@_);
[12628]182 my $high_level_information_only = shift(@_);
183
[12330]184 my $optionlistref = $self->{'option_list'};
185 my @optionlist = @$optionlistref;
186 my $plugoutoptions = shift(@$optionlistref);
187 return if (!defined($plugoutoptions));
188
189 gsprintf(STDERR, "<PlugoutInfo>\n");
190 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
191 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192 $desc =~ s/</&amp;lt;/g; # doubly escaped
193 $desc =~ s/>/&amp;gt;/g;
194 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
195 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
[12628]197 unless (defined($high_level_information_only)) {
198 gsprintf(STDERR, " <Arguments>\n");
199 if (defined($plugoutoptions->{'args'})) {
200 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201 }
202 gsprintf(STDERR, " </Arguments>\n");
[12330]203
[12628]204 # Recurse up the plugout hierarchy
205 $self->print_xml();
[12330]206 }
207 gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213 my $self = shift(@_);
214
215 # Print the usage message for a plugout (recursively)
216 my $descoffset = $self->determine_description_offset(0);
217 $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222 my $self = shift(@_);
223 my $maxoffset = shift(@_);
224
225 my $optionlistref = $self->{'option_list'};
226 my @optionlist = @$optionlistref;
227 my $plugoutoptions = pop(@$optionlistref);
228 return $maxoffset if (!defined($plugoutoptions));
229
230 # Find the length of the longest option string of this download
231 my $plugoutargs = $plugoutoptions->{'args'};
232 if (defined($plugoutargs)) {
233 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234 if ($longest > $maxoffset) {
235 $maxoffset = $longest;
236 }
237 }
238
239 # Recurse up the download hierarchy
240 $maxoffset = $self->determine_description_offset($maxoffset);
241 $self->{'option_list'} = \@optionlist;
242 return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248 my $self = shift(@_);
249 my $descoffset = shift(@_);
250 my $isleafclass = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $plugoutoptions = shift(@$optionlistref);
255 return if (!defined($plugoutoptions));
256
257 my $plugoutname = $plugoutoptions->{'name'};
258 my $plugoutargs = $plugoutoptions->{'args'};
259 my $plugoutdesc = $plugoutoptions->{'desc'};
260
261 # Produce the usage information using the data structure above
262 if ($isleafclass) {
263 if (defined($plugoutdesc)) {
264 gsprintf(STDERR, "$plugoutdesc\n\n");
265 }
266 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267 }
268
269 # Display the download options, if there are some
270 if (defined($plugoutargs)) {
271 # Calculate the column offset of the option descriptions
272 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
273
274 if ($isleafclass) {
275 gsprintf(STDERR, " {common.specific_options}:\n");
276 }
277 else {
278 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279 }
280
281 # Display the download options
282 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283 }
284
285 # Recurse up the download hierarchy
286 $self->print_plugout_usage($descoffset, 0);
287 $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293 my ($strFunctionName,$strError) = @_;
294 {
[17202]295 print "Error occoured in BasePlugout.pm\n".
[12330]296 "In Function: ".$strFunctionName."\n".
297 "Error Message: ".$strError."\n";
298 exit(-1);
299 }
300}
301
[27352]302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
[12330]303sub set_OIDtype {
304 my $self = shift (@_);
[12618]305 my ($type, $metadata) = @_;
[12330]306
[27352]307 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
[12330]308 $self->{'OIDtype'} = $type;
309 } else {
310 $self->{'OIDtype'} = "hash";
311 }
[12618]312 if ($type =~ /^assigned$/) {
313 if (defined $metadata) {
314 $self->{'OIDmetadata'} = $metadata;
315 } else {
316 $self->{'OIDmetadata'} = "dc.Identifier";
317 }
318 }
[12330]319}
320
321sub set_output_dir
322{
323 my $self = shift @_;
324 my ($output_dir) = @_;
325
326 $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331 my $self = shift @_;
332 my ($output_dir) = @_;
333
334 $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339 my $self = shift (@_);
340
341 return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346 my $self = shift (@_);
347
348 return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353 my $self = shift (@_);
354
355 return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361 my $self = shift (@_);
362
363 my ($output_file_name) = @_;
364
[27500]365 my $fh;
366 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
367
368 return $fh;
[12330]369}
370
371sub release_output_handler
372{
373 my $self = shift (@_);
374 my ($outhandler) = @_;
375
376 close($outhandler);
377
378}
379
380sub output_xml_header {
381 my $self = shift (@_);
382 my ($handle,$docroot,$nondoctype) = @_;
383
[23824]384
385 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
386
387 #For Dspace must be UTF in lower case
388 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
[12330]389
390 if (!defined $nondoctype){
[22818]391 my $doctype = (defined $docroot) ? $docroot : "Section";
392
393 # Used to be '<!DOCTYPE Archive SYSTEM ...'
394
395 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
[12330]396 }
397
398 print $handle "<$docroot>\n" if defined $docroot;
399}
400
401sub output_xml_footer {
402 my $self = shift (@_);
403 my ($handle,$docroot) = @_;
404 print $handle "</$docroot>\n" if defined $docroot;
405}
406
[23824]407
408sub output_general_xml_header
409{
410 my $self = shift (@_);
411 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
412
413 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
414
415 if (defined $opt_dtd) {
416 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
417 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
418 }
419
420 if (defined $docroot) {
421 my $full_docroot = $docroot;
422 if (defined $opt_attributes) {
423 $full_docroot .= " $opt_attributes";
424 }
425
426 print $handle "<$full_docroot>\n"
427 }
428}
429
430sub output_general_xml_footer
431{
432 output_xml_footer(@_);
433}
434
435
[12330]436sub process {
437 my $self = shift (@_);
438 my ($doc_obj) = @_;
[12459]439
[23939]440 # for OAI purposes
[12330]441 $doc_obj->set_lastmodified();
[23939]442 $doc_obj->set_oailastmodified();
[12330]443
444 if ($self->{'group_size'} > 1) {
[12459]445 $self->group_process ($doc_obj);
[12330]446 return;
447 }
448
449 my $OID = $doc_obj->get_OID();
450 $OID = "NULL" unless defined $OID;
451
452 my $top_section = $doc_obj->get_top_section();
453
454 #get document's directory
455 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456
457 my $output_info = $self->{'output_info'};
458 return if (!defined $output_info);
459
460 ##############################
461 # call subclass' saveas method
462 ##############################
463 $self->saveas($doc_obj,$doc_dir);
[21564]464 $self->archiveinf_db($doc_obj,$doc_dir);
[17087]465
[12330]466}
467
[12363]468sub store_output_info_reference {
[12330]469 my $self = shift (@_);
470 my ($doc_obj) = @_;
471
[12363]472 my $output_info = $self->{'output_info'};
473 my $metaname = $self->{'sortmeta'};
474 if (!defined $metaname || $metaname !~ /\S/) {
475 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
476 return;
477 }
[12330]478
[12363]479 my $metadata = "";
480 my $top_section = $doc_obj->get_top_section();
481
482 my @commameta_list = split(/,/, $metaname);
483 foreach my $cmn (@commameta_list) {
484 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
485 if ($meta) {
486 # do remove prefix/suffix - this will apply to all values
487 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
488 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
489 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
490 $metadata .= $meta if ($meta);
491 }
[12330]492 }
493
494 # store reference in the output_info
495 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
496
497}
498
499sub group_process {
500
501 my $self = shift (@_);
502 my ($doc_obj) = @_;
503
504 my $OID = $doc_obj->get_OID();
505 $OID = "NULL" unless defined $OID;
506
507 my $groupsize = $self->{'group_size'};
508 my $gs_count = $self->{'gs_count'};
509 my $open_new_file = (($gs_count % $groupsize)==0);
510 my $outhandle = $self->{'output_handle'};
511
512 # opening a new file, or document has assoicated files => directory needed
[12459]513 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
514
[12330]515 # The directory the archive file (doc.xml) and all associated files
516 # should end up in
517 my $doc_dir;
518 # If we've determined its time for a new file, open it now
519 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
520 {
521 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
522 # only if opening new file
523 my $output_dir = $self->get_output_dir();
[12459]524 &util::mk_all_dir ($output_dir) unless -e $output_dir;
[27306]525 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
526 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
[12459]527
[12330]528 if ($gs_count>0)
529 {
530 return if (!$self->close_file_output());
531 }
532
[17202]533 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
[12330]534
535
536 $self->{'gs_filename'} = $doc_file;
[12459]537 $self->{'short_doc_file'} = $short_doc_file;
[12330]538 $self->{'gs_OID'} = $OID;
539 $self->{'gs_doc_dir'} = $doc_dir;
540
[17202]541 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
[12330]542 }
543 # Otherwise load the same archive document directory used last time
544 else
545 {
546 $doc_dir = $self->{'gs_doc_dir'};
547 }
548
549 # copy all the associated files, add this information as metadata
550 # to the document
[13172]551 print $outhandle "Writing associated files to $doc_dir\n";
[12330]552 $self->process_assoc_files ($doc_obj, $doc_dir);
[19494]553
554 # look up 'gsdlmetafile' metadata and store that information
555 # explicitly in $doc_obj
556 $self->process_metafiles_metadata ($doc_obj);
[12330]557 }
558
559 # save this document
[13172]560 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
561 print GROUPPROCESS $section_text;
[12330]562
563 $self->{'gs_count'}++;
564}
565
566
567sub saveas {
568 my $self = shift (@_);
569
570 die "Basplug::saveas function must be implemented in sub classes\n";
571}
572
573sub get_doc_dir {
574 my $self = shift (@_);
575 my ($OID, $source_filename) = @_;
576
577 my $working_dir = $self->get_output_dir();
[19775]578 my $working_info = $self->{'output_info'};
[12330]579 return if (!defined $working_info);
580
581 my $doc_info = $working_info->get_info($OID);
582 my $doc_dir = '';
583
[16252]584 if (defined $doc_info && scalar(@$doc_info) >= 1)
585 {
586 # This OID already has an archives directory, so use it again
[12330]587 $doc_dir = $doc_info->[0];
588 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
[16252]589 }
590 elsif ($self->{'keep_import_structure'})
591 {
[12330]592 $source_filename = &File::Basename::dirname($source_filename);
593 $source_filename =~ s/[\\\/]+/\//g;
594 $source_filename =~ s/\/$//;
595
596 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
[16252]597 }
[12330]598
[16252]599 # We have to use a new archives directory for this document
600 if ($doc_dir eq "")
601 {
602 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
[12330]603 }
604
[12603]605 if (!defined $self->{'group'} || !$self->{'group'}){
[27306]606 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
[12603]607 }
[16252]608
[12330]609 return $doc_dir;
610}
611
612sub get_new_doc_dir{
613 my $self = shift (@_);
614 my($working_info,$working_dir,$OID) = @_;
615
[19180]616
[12330]617 my $doc_dir = "";
618 my $doc_dir_rest = $OID;
[22232]619
[19180]620 # remove any \ and / from the OID
621 $doc_dir_rest =~ s/[\\\/]//g;
[22232]622
623 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
624 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
625
[12330]626 my $doc_dir_num = 0;
627
628 do {
629 $doc_dir .= "/" if $doc_dir_num > 0;
[27499]630 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
631 if ($self->{'subdir_hash_prefix'})
632 {
633 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
634 }
635 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
636 if ($doc_dir_rest =~ s/$pattern//i)
637 {
[12330]638 $doc_dir .= $1;
639 $doc_dir_num++;
640 }
641 } while ($doc_dir_rest ne "" &&
[27306]642 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
[12330]643 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
[17294]644 my $i = 1;
645 my $doc_dir_base = $doc_dir;
[27306]646 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
[17294]647 $doc_dir = "$doc_dir_base-$i";
648 $i++;
649 }
650
[16252]651 return "$doc_dir.dir";
[12330]652}
653
654sub process_assoc_files {
655 my $self = shift (@_);
656 my ($doc_obj, $doc_dir, $handle) = @_;
657
658 my $outhandle = $self->{'output_handle'};
659
660 my $output_dir = $self->get_output_dir();
661 return if (!defined $output_dir);
662
663 &util::mk_all_dir ($output_dir) unless -e $output_dir;
664
[27306]665 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
[12330]666 &util::mk_all_dir ($working_dir) unless -e $working_dir;
667
668 my @assoc_files = ();
669 my $filename;;
670
671 my $source_filename = $doc_obj->get_source_filename();
672
673 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
674
675 if (defined $collect_dir) {
676 my $dirsep_regexp = &util::get_os_dirsep();
677
678 if ($collect_dir !~ /$dirsep_regexp$/) {
679 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
680 }
681
682 # This test is never going to fail on Windows -- is this a problem?
683
684 if ($source_filename !~ /^$dirsep_regexp/) {
[27306]685 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
[12330]686 }
687 }
688
689
690 # set the assocfile path (even if we have no assoc files - need this for lucene)
691 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
692 "assocfilepath",
693 "$doc_dir");
694 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
695 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
696 $dir = "" unless defined $dir;
697
[23363]698 my $utf8_real_filename = $assoc_file_rec->[0];
699
[12330]700 # for some reasons the image associate file has / before the full path
[23363]701 $utf8_real_filename =~ s/^\\(.*)/$1/i;
702
[23387]703## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
704 my $real_filename = $utf8_real_filename;
705 $real_filename = &util::downgrade_if_dos_filename($real_filename);
[23363]706
[12330]707 if (-e $real_filename) {
708
[27306]709 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
[12330]710
[27306]711 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
712
[12330]713 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
714 "gsdlassocfile",
715 "$afile:$assoc_file_rec->[2]:$dir");
[23363]716 } elsif ($self->{'verbosity'} > 1) {
[17202]717 print $outhandle "BasePlugout::process couldn't copy the associated file " .
[12330]718 "$real_filename to $afile\n";
719 }
720 }
721}
722
[17087]723
[19494]724sub process_metafiles_metadata
725{
726 my $self = shift (@_);
727 my ($doc_obj) = @_;
728
729 my $top_section = $doc_obj->get_top_section();
730 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
731
732 foreach my $metafile_pair (@$metafiles) {
[19516]733 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
[19494]734
735 $doc_obj->metadata_file($full_metafile,$metafile);
736 }
737
738 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
739}
740
741sub archiveinf_files_to_field
742{
743 my $self = shift(@_);
744 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
745
746 foreach my $file_rec (@$files) {
[20777]747 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
748 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
[19494]749 # for some reasons the image associate file has / before the full path
750 $real_filename =~ s/^\\(.*)/$1/i;
751
[23387]752 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
[23363]753
754 if (-e $raw_filename) {
755
[20763]756# if (defined $collect_dir) {
757# my $collect_dir_re_safe = $collect_dir;
[24829]758# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
[20763]759# $collect_dir_re_safe =~ s/\./\\./g;##
[19494]760
[20763]761# $real_filename =~ s/^$collect_dir_re_safe//;
762# }
[22328]763
[20801]764 if (defined $reverse_lookups) {
765 $reverse_lookups->{$real_filename} = 1;
766 }
[23363]767### push(@{$oid_files->{$field}},$full_file);
768 push(@{$oid_files->{$field}},$raw_filename);
[19494]769 }
770 else {
[19516]771 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
[19494]772 }
773 }
774}
775
[21564]776sub archiveinf_db
[17087]777{
778 my $self = shift (@_);
779 my ($doc_obj) = @_;
780
781 my $verbosity = $self->{'verbosity'};
782
783 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
784 if (defined $collect_dir) {
785 my $dirsep_regexp = &util::get_os_dirsep();
786
787 if ($collect_dir !~ /$dirsep_regexp$/) {
788 # ensure there is a slash at the end
789 $collect_dir .= &util::get_dirsep();
790 }
791 }
792
793 my $oid = $doc_obj->get_OID();
[19829]794 my $source_filename = $doc_obj->get_unmodified_source_filename();
[18441]795 my $working_info = $self->{'output_info'};
796 my $doc_info = $working_info->get_info($oid);
[20651]797
[20747]798 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
[22328]799 # doc_file is the path to the archive doc.xml. Make sure it has unix
800 # slashes, then if the collection is copied to linux, it can be built without reimport
801 $doc_file =~ s/\\/\//g;
[18441]802 my $oid_files = { 'doc-file' => $doc_file,
803 'index-status' => $index_status,
804 'src-file' => $source_filename,
[20747]805 'sort-meta' => $sortmeta,
[19775]806 'assoc-file' => [],
807 'meta-file' => [] };
[17087]808
[19494]809 my $reverse_lookups = { $source_filename => "1" };
[17087]810
811
[20777]812 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
[19494]813 $collect_dir,$oid_files,$reverse_lookups);
[17087]814
[17120]815
[19775]816 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
[20801]817 $collect_dir,$oid_files);
[17087]818
[21584]819 # Get the infodbtype value for this collection from the arcinfo object
820 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
[17087]821 my $output_dir = $self->{'output_dir'};
822
[21584]823 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
824 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
[17087]825
[18528]826 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
[17087]827
[24958]828 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
829 my $top_section = $doc_obj->get_top_section();
830
[27071]831 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
[24958]832 my $dc_title = join("; ", @$dc_titles);
833
834 if ($oid_files->{'index-status'} eq "R") {
835 $dc_title .= " (Updated)";
836 }
837
[27306]838 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
[24958]839 if (open(RSSOUT,">>$rss_filename")) {
840 print RSSOUT "<item>\n";
841 print RSSOUT " <title>$dc_title</title>\n";
842 print RSSOUT " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
843 print RSSOUT "</item>\n";
844 close(RSSOUT);
845 }
846 else {
847 print STDERR "**** Failed to open $rss_filename\n!$\n";
848 }
849
850
851 }
852
[19775]853 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
854 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
855 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
[20747]856 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
[19775]857
[21584]858 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
859 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
860 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
[19775]861
[19494]862 foreach my $rl (keys %$reverse_lookups) {
[19775]863 $working_info->add_reverseinfo($rl,$oid);
[20801]864 }
865
866 # meta files not set in reverese entry, but need to set the metadata flag
[20814]867 if (defined $doc_obj->get_meta_files()) {
868 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
869 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
870 $working_info->set_meta_file_flag($full_file);
871 }
[20801]872 }
[17087]873}
874
875
[12330]876sub set_sortmeta {
877 my $self = shift (@_);
878 my ($sortmeta, $removeprefix, $removesuffix) = @_;
879
880 $self->{'sortmeta'} = $sortmeta;
881 if (defined ($removeprefix) && $removeprefix ) {
882 $removeprefix =~ s/^\^//; # don't need a leading ^
883 $self->{'removeprefix'} = $removeprefix;
884 }
885 if (defined ($removesuffix) && $removesuffix) {
886 $removesuffix =~ s/\$$//; # don't need a trailing $
887 $self->{'removesuffix'} = $removesuffix;
888 }
889}
890
891sub open_xslt_pipe
892{
893 my $self = shift @_;
894 my ($output_file_name, $xslt_file)=@_;
895
[20320]896 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
897
[27306]898 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
[13013]899
[15140]900 my $mapping_file_path = "";
901
[13225]902 if ($ENV{'GSDLOS'} eq "windows"){
[27306]903 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
[20320]904 # this file:/// bit didn't work for me on windows XP
905 #$xslt_file = "\"file:///".$xslt_file."\"";
906 #$mapping_file_path = "\"file:///";
[13225]907 }
908 else{
[27306]909 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
[13225]910 }
911
912
[13013]913 $java_class_path = "\"".$java_class_path."\"";
[13024]914
[20902]915 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
[12330]916
[12603]917 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
[13064]918 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
[14969]919 $cmd .= "-m $mapping_file_path";
[12603]920 }
[20320]921
[12330]922 open(*XMLWRITER, $cmd)
923 or die "can't open pipe to xslt: $!";
924
925
926 $self->{'xslt_writer'} = *XMLWRITER;
927
928 print XMLWRITER "<?DocStart?>\n";
929 print XMLWRITER "$output_file_name\n";
[14969]930
[12330]931
932 }
933
934
935sub close_xslt_pipe
936{
937 my $self = shift @_;
938
939
940 return unless defined $self->{'xslt_writer'} ;
941
942 my $xsltwriter = $self->{'xslt_writer'};
943
944 print $xsltwriter "<?DocEnd?>\n";
945 close($xsltwriter);
[13024]946
947 undef $self->{'xslt_writer'};
948
[12330]949}
950
951sub close_file_output
952{
953 my ($self) = @_;
954
955 # make sure that the handle has been opened - it won't be if we failed
956 # to import any documents...
957 if (defined(fileno(GROUPPROCESS))) {
958 $self->output_xml_footer('GROUPPROCESS','Archive');
959 close GROUPPROCESS;
960 }
961
962 my $OID = $self->{'gs_OID'};
963 my $short_doc_file = $self->{'short_doc_file'};
964
965 if ($self->{'gzip'}) {
966 my $doc_file = $self->{'gs_filename'};
967 `gzip $doc_file`;
968 $doc_file .= ".gz";
969 $short_doc_file .= ".gz";
970 if (!-e $doc_file) {
971 my $outhandle = $self->{'output_handle'};
972 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
973 return 0;
974 }
975 }
976
977 # store reference in output_info
978 my $output_info = $self->{'output_info'};
979 return 0 if (!defined $output_info);
980 $output_info->add_info($OID, $short_doc_file, undef, undef);
981 return 1;
982}
983
[13172]984
[12330]985#the subclass should implement this method if is_group method could return 1.
986sub close_group_output{
987 my $self = shift (@_);
988}
989
990sub is_group {
991 my $self = shift (@_);
992 return 0;
993}
994
[13172]995my $dc_set = { Title => 1,
996 Creator => 1,
997 Subject => 1,
998 Description => 1,
999 Publisher => 1,
1000 Contributor => 1,
1001 Date => 1,
1002 Type => 1,
1003 Format => 1,
1004 Identifier => 1,
1005 Source => 1,
1006 Language => 1,
1007 Relation => 1,
1008 Coverage => 1,
1009 Rights => 1};
1010
1011
1012# returns an XML representation of the dublin core metadata
[24404]1013# if dc meta is not found, try ex meta
1014# This method is not used by the DSpacePlugout, which has its
1015# own method to save its dc metadata
[13172]1016sub get_dc_metadata {
1017 my $self = shift(@_);
1018 my ($doc_obj, $section, $version) = @_;
1019
1020 # build up string of dublin core metadata
1021 $section="" unless defined $section;
1022
1023 my $section_ptr = $doc_obj->_lookup_section($section);
1024 return "" unless defined $section_ptr;
1025
1026
1027 my $explicit_dc = {};
[24404]1028 my $explicit_ex_dc = {};
[13172]1029 my $explicit_ex = {};
1030
1031 my $all_text="";
[24404]1032
1033 # We want high quality dc metadata to go in first, so we store all the
1034 # assigned dc.* values first. Then, for all those dc metadata names in
1035 # the official dc set that are as yet unassigned, we look to see whether
1036 # embedded ex.dc.* metadata has defined some values for them. If not,
1037 # then for the same missing dc metadata names, we look in ex metadata.
1038
[13172]1039 foreach my $data (@{$section_ptr->{'metadata'}}){
1040 my $escaped_value = &docprint::escape_text($data->[1]);
1041 if ($data->[0]=~ m/^dc\./) {
1042 $data->[0] =~ tr/[A-Z]/[a-z]/;
1043
1044 $data->[0] =~ m/^dc\.(.*)/;
1045 my $dc_element = $1;
1046
1047 if (!defined $explicit_dc->{$dc_element}) {
1048 $explicit_dc->{$dc_element} = [];
1049 }
1050 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1051
1052 if (defined $version && ($version eq "oai_dc")) {
1053 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1054 }
1055 else {
1056 # qualifier???
1057 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1058 }
1059
[24404]1060 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1061 $data->[0] =~ m/^ex\.dc\.(.*)/;
1062 my $ex_dc_element = $1;
1063 my $lc_ex_dc_element = lc($ex_dc_element);
1064
1065 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1066 if (defined $dc_set->{$ex_dc_element}) {
1067 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1068 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1069 }
1070 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1071 }
1072 }
1073 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
[13172]1074 $data->[0] =~ m/^(ex\.)?(.*)/;
[24404]1075 my $ex_element = $2;
[13172]1076 my $lc_ex_element = lc($ex_element);
1077
1078 if (defined $dc_set->{$ex_element}) {
1079 if (!defined $explicit_ex->{$lc_ex_element}) {
1080 $explicit_ex->{$lc_ex_element} = [];
1081 }
1082 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1083 }
1084 }
1085 }
1086
1087 # go through dc_set and for any element *not* defined in explicit_dc
[14932]1088 # that does exist in explicit_ex, add it in as metadata
[13172]1089 foreach my $k ( keys %$dc_set ) {
1090 my $lc_k = lc($k);
1091
1092 if (!defined $explicit_dc->{$lc_k}) {
[24404]1093 # try to find if ex.dc.* defines this dc.* meta,
1094 # if not, then look for whether there's an ex.* equivalent
[13172]1095
[24404]1096 if (defined $explicit_ex_dc->{$lc_k}) {
1097 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1098 my $dc_element = $lc_k;
1099 my $escaped_value = $v;
1100
1101 if (defined $version && ($version eq "oai_dc")) {
1102 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1103 }
1104 else {
1105 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1106 }
1107 }
1108 } elsif (defined $explicit_ex->{$lc_k}) {
[13172]1109 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1110 my $dc_element = $lc_k;
1111 my $escaped_value = $v;
1112
1113 if (defined $version && ($version eq "oai_dc")) {
1114 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1115 }
1116 else {
1117 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1118 }
1119 }
1120 }
1121 }
1122 }
1123
1124 if ($all_text eq "") {
1125 $all_text .= " There is no Dublin Core metatdata in this document\n";
1126 }
1127 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1128
1129 return $all_text;
1130}
1131
1132# Build up dublin_core metadata. Priority given to dc.* over ex.*
1133# This method was apparently added by Jeffrey and committed by Shaoqun.
1134# But we don't know why it was added, so not using it anymore.
1135sub new_get_dc_metadata {
1136
1137 my $self = shift(@_);
1138 my ($doc_obj, $section, $version) = @_;
1139
1140 # build up string of dublin core metadata
1141 $section="" unless defined $section;
1142
1143 my $section_ptr=$doc_obj->_lookup_section($section);
1144 return "" unless defined $section_ptr;
1145
1146 my $all_text = "";
1147 foreach my $data (@{$section_ptr->{'metadata'}}){
1148 my $escaped_value = &docprint::escape_text($data->[1]);
1149 my $dc_element = $data->[0];
1150
1151 my @array = split('\.',$dc_element);
1152 my ($type,$name);
1153
1154 if(defined $array[1])
1155 {
1156 $type = $array[0];
1157 $name = $array[1];
1158 }
1159 else
1160 {
1161 $type = "ex";
1162 $name = $array[0];
1163 }
1164
1165 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1166 }
1167 return $all_text;
1168}
1169
1170
[12330]11711;
Note: See TracBrowser for help on using the repository browser.