source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 26536

Last change on this file since 26536 was 26536, checked in by davidb, 11 years ago

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including rebuilt after the Greenstone install has been upgraded

  • Property svn:keywords set to Author Date Id Revision
File size: 32.0 KB
RevLine 
[12330]1###########################################################################
2#
[17202]3# BasePlugout.pm -- base class for all the plugout modules
[12330]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[17202]26package BasePlugout;
[12330]27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
[12459]32no strict 'refs';
[12330]33
[21565]34use dbutil;
[12330]35use gsprintf 'gsprintf';
36use printusage;
[12546]37use parse2;
[12330]38
[17087]39
[12330]40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
# Arguments accepted by every plugout.  Each 'desc' is a "{...}" key rather
# than literal text.
my $arguments = [
    { 'name'      => "group_size",
      'desc'      => "{BasPlugout.group_size}",
      'type'      => "int",
      'deft'      => "1",
      'reqd'      => "no",
      'hiddengli' => "no" },

    { 'name'      => "output_info",
      'desc'      => "{BasPlugout.output_info}",
      'type'      => "string",
      'reqd'      => "yes",
      'hiddengli' => "yes" },

    { 'name'      => "xslt_file",
      'desc'      => "{BasPlugout.xslt_file}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "no" },

    { 'name'      => "output_handle",
      'desc'      => "{BasPlugout.output_handle}",
      'type'      => "string",
      'deft'      => 'STDERR',
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "verbosity",
      'desc'      => "{BasPlugout.verbosity}",
      'type'      => "int",
      'deft'      => "0",
      'reqd'      => "no",
      'hiddengli' => "no" },

    { 'name'      => "gzip_output",
      'desc'      => "{BasPlugout.gzip_output}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "no" },

    { 'name'      => "debug",
      'desc'      => "{BasPlugout.debug}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];

# Description of this abstract base class; 'args' points at the table above.
my $options = {
    'name'     => "BasePlugout",
    'desc'     => "{BasPlugout.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
91
# Constructor shared by all plugouts.
#   $plugoutlist     - array ref of plugout class names; $class is appended
#   $args            - array ref of command-line style arguments
#   $hashArgOptLists - hash ref holding the accumulated "ArgList" / "OptList"
# If "-gsdlinfo" is among $args the arguments are not parsed and the object
# is returned with 'info_only' set.  Dies with "\n" when parse2::parse
# reports -1 (after printing usage) or when the xslt_file cannot be located.
sub new
{
    my $class = shift(@_);
    my ($plugoutlist, $args, $hashArgOptLists) = @_;

    push(@$plugoutlist, $class);

    # The first entry of the list names the plugout in error messages
    my $plugout_name = $plugoutlist->[0];
    $plugout_name = $class unless (defined $plugout_name);

    # Contribute this class's arguments and options to the shared lists
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = {
        'plugout_type' => $class,
        'option_list'  => $hashArgOptLists->{"OptList"},
    };

    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
    # the args, just return the object.
    if (grep { defined $_ && $_ eq "-gsdlinfo" } @{$args}) {
        $self->{"info_only"} = 1;
        return bless $self, $class;
    }

    if (parse2::parse($args, $hashArgOptLists->{"ArgList"}, $self) == -1) {
        # Bless a temporary object so the usage text can be printed
        my $usage_printer = bless $self, $class;
        print STDERR "<BadPlugout d=$plugout_name>\n";
        &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
        $usage_printer->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    # Replace a supplied xslt_file with the path found for it
    if (defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "") {
        my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
        unless (defined $full_file_path) {
            print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
            die "\n";
        }
        $self->{'xslt_file'} = $full_file_path;
    }

    $self->{'gs_count'} = 0;
    $self->{'keep_import_structure'} = 0;

    return bless $self, $class;
}
149
# Print this plugout's description as XML.
#   $header                      - when true, emit the XML header first
#   $high_level_information_only - passed through to print_xml
sub print_xml_usage
{
    my ($self, $header, $high_level_information_only) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8;

    &PrintUsage::print_xml_header("plugout") if ($header);

    $self->print_xml($high_level_information_only);
}
164
165
# Emit a <PlugoutInfo> element for the first entry of 'option_list' and,
# unless $high_level_information_only is defined, its arguments followed by
# a nested <PlugoutInfo> for each remaining entry (by recursion).
#
# Entries are shifted off the shared list while walking it; the list is put
# back afterwards, as print_plugout_usage and determine_description_offset
# do, so that a later usage call still sees the full hierarchy.
sub print_xml
{
    my $self = shift(@_);
    my $high_level_information_only = shift(@_);

    my $optionlistref = $self->{'option_list'};
    my @optionlist = @$optionlistref;    # snapshot used to restore the list
    my $plugoutoptions = shift(@$optionlistref);
    return if (!defined($plugoutoptions));

    gsprintf(STDERR, "<PlugoutInfo>\n");
    gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
    $desc =~ s/</&amp;lt;/g; # doubly escaped
    $desc =~ s/>/&amp;gt;/g;
    gsprintf(STDERR, " <Desc>$desc</Desc>\n");
    gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
    gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
    unless (defined($high_level_information_only)) {
        gsprintf(STDERR, " <Arguments>\n");
        if (defined($plugoutoptions->{'args'})) {
            &PrintUsage::print_options_xml($plugoutoptions->{'args'});
        }
        gsprintf(STDERR, " </Arguments>\n");

        # Recurse up the plugout hierarchy
        $self->print_xml();
    }
    gsprintf(STDERR, "</PlugoutInfo>\n");

    # Undo the destructive shift(s) performed above
    $self->{'option_list'} = \@optionlist;
}
196
197
# Print the plain-text usage message for a plugout (recursively): first work
# out the description column, then print starting from the leaf class.
sub print_txt_usage
{
    my ($self) = @_;

    $self->print_plugout_usage($self->determine_description_offset(0), 1);
}
206
# Return the larger of $maxoffset and the longest option string found in any
# entry of 'option_list'.  Entries are popped one per recursion level and the
# list is restored on the way back out.
sub determine_description_offset
{
    my ($self, $maxoffset) = @_;

    my $optionlistref = $self->{'option_list'};
    my @saved_list = @$optionlistref;
    my $plugoutoptions = pop(@$optionlistref);
    return $maxoffset unless defined($plugoutoptions);

    # Find the length of the longest option string of this plugout
    if (defined($plugoutoptions->{'args'})) {
        my $longest = &PrintUsage::find_longest_option_string($plugoutoptions->{'args'});
        $maxoffset = $longest if ($longest > $maxoffset);
    }

    # Recurse up the plugout hierarchy
    $maxoffset = $self->determine_description_offset($maxoffset);
    $self->{'option_list'} = \@saved_list;
    return $maxoffset;
}
231
232
# Print the text usage for the first entry of 'option_list', then recurse for
# the remaining entries; the list is restored afterwards.
#   $descoffset  - width of the longest option string
#   $isleafclass - true for the first (most specific) class only
sub print_plugout_usage
{
    my ($self, $descoffset, $isleafclass) = @_;

    my $optionlistref = $self->{'option_list'};
    my @saved_list = @$optionlistref;
    my $plugoutoptions = shift(@$optionlistref);
    return unless defined($plugoutoptions);

    my ($plugoutname, $plugoutargs, $plugoutdesc) =
        @{$plugoutoptions}{'name', 'args', 'desc'};

    # Produce the usage information using the data structure above
    if ($isleafclass) {
        gsprintf(STDERR, "$plugoutdesc\n\n") if (defined($plugoutdesc));
        gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
    }

    # Display the plugout options, if there are some
    if (defined($plugoutargs)) {
        if ($isleafclass) {
            gsprintf(STDERR, " {common.specific_options}:\n");
        }
        else {
            gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
        }

        # 2 spaces between options & descriptions
        &PrintUsage::print_options_txt($plugoutargs, $descoffset + 2);
    }

    # Recurse up the plugout hierarchy
    $self->print_plugout_usage($descoffset, 0);
    $self->{'option_list'} = \@saved_list;
}
276
277
# Report a fatal error on standard output and terminate with exit(-1).
# Called as a plain function: error($function_name, $message).
sub error
{
    my ($strFunctionName, $strError) = @_;

    print "Error occoured in BasePlugout.pm\n"
        . "In Function: $strFunctionName\n"
        . "Error Message: $strError\n";
    exit(-1);
}
288
# OIDtype may be "hash", "hash_on_full_filename", "incremental", "dirname",
# "full_filename" or "assigned".  Anything else, including an undefined
# value, falls back to "hash".
# For "assigned", $metadata names the metadata element stored in
# 'OIDmetadata'; it defaults to "dc.Identifier".
sub set_OIDtype {
    my $self = shift (@_);
    my ($type, $metadata) = @_;

    # Treat a missing type like an unrecognised one
    $type = "" unless defined $type;

    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
        $self->{'OIDtype'} = $type;
    } else {
        $self->{'OIDtype'} = "hash";
    }
    if ($type =~ /^assigned$/) {
        $self->{'OIDmetadata'} = (defined $metadata) ? $metadata : "dc.Identifier";
    }
}
307
# Record the directory that output is written under.
sub set_output_dir
{
    my ($self, $output_dir) = @_;

    $self->{'output_dir'} = $output_dir;
}
315
# Alternative spelling of set_output_dir: stores $output_dir in 'output_dir'.
sub setoutputdir
{
    my ($self, $output_dir) = @_;

    $self->{'output_dir'} = $output_dir;
}
323
# Return the directory previously stored in 'output_dir' (undef if unset).
sub get_output_dir
{
    my ($self) = @_;

    return $self->{'output_dir'};
}
330
# Alternative spelling of get_output_dir: returns the 'output_dir' value.
sub getoutputdir
{
    my ($self) = @_;

    return $self->{'output_dir'};
}
337
# Return the object stored in 'output_info' (undef if unset).
sub getoutputinfo
{
    my ($self) = @_;

    return $self->{'output_info'};
}
344
345
# Open $output_file_name for writing (truncating it) and return the handle.
# Dies if the file cannot be opened.
#
# A fresh lexical handle is returned on every call, so two handles obtained
# from this method can be open at once.  The three-argument open also stops
# characters in the file name from being read as part of the open mode.
sub get_output_handler
{
    my $self = shift (@_);
    my ($output_file_name) = @_;

    open(my $output_handle, '>', $output_file_name)
        or die "Can not open a file handler for $output_file_name\n";

    return $output_handle;
}
356
# Close a handle previously obtained for output.
sub release_output_handler
{
    my ($self, $outhandler) = @_;

    close($outhandler);
}
365
# Write the XML declaration, an Archive DOCTYPE line (skipped when
# $nondoctype is defined) and, if $docroot is defined, its opening tag.
# The DOCTYPE names $docroot, or "Section" when $docroot is undefined.
sub output_xml_header {
    my ($self, $handle, $docroot, $nondoctype) = @_;

    # For Dspace must be UTF in lower case
    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";

    unless (defined $nondoctype) {
        # Used to be '<!DOCTYPE Archive SYSTEM ...'
        my $doctype = "Section";
        $doctype = $docroot if (defined $docroot);

        print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
    }

    print $handle "<$docroot>\n" if defined $docroot;
}
386
# Write the closing tag for $docroot; nothing is written when it is undefined.
sub output_xml_footer {
    my ($self, $handle, $docroot) = @_;

    print $handle "</$docroot>\n" if defined $docroot;
}
392
[23824]393
# Write an XML declaration, an optional DOCTYPE and an optional root tag.
#   $docroot        - root element name; no opening tag if undefined
#   $opt_attributes - attribute text appended inside the root tag
#   $opt_dtd        - system identifier; no DOCTYPE line if undefined
#   $opt_doctype    - name used in the DOCTYPE instead of $docroot
sub output_general_xml_header
{
    my ($self, $handle, $docroot, $opt_attributes, $opt_dtd, $opt_doctype) = @_;

    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";

    if (defined $opt_dtd) {
        my $doctype = $docroot;
        $doctype = $opt_doctype if (defined $opt_doctype);
        print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
    }

    if (defined $docroot) {
        my $full_docroot = (defined $opt_attributes)
            ? "$docroot $opt_attributes"
            : $docroot;

        print $handle "<$full_docroot>\n";
    }
}
415
# Footer counterpart of output_general_xml_header: hands all of its
# arguments to output_xml_footer as a plain function call.
sub output_general_xml_footer
{
    my @footer_args = @_;

    return output_xml_footer(@footer_args);
}
420
421
# Entry point for writing one document.  Stamps the modification times,
# then either hands the document to group_process (group_size > 1) or calls
# the subclass's saveas followed by archiveinf_db.  Nothing is saved when
# 'output_info' is undefined.
sub process {
    my ($self, $doc_obj) = @_;

    # for OAI purposes
    $doc_obj->set_lastmodified();
    $doc_obj->set_oailastmodified();

    if ($self->{'group_size'} > 1) {
        $self->group_process ($doc_obj);
        return;
    }

    my $OID = $doc_obj->get_OID();
    if (!defined $OID) {
        $OID = "NULL";
    }

    my $top_section = $doc_obj->get_top_section();

    # get document's directory
    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());

    return unless (defined $self->{'output_info'});

    ##############################
    # call subclass' saveas method
    ##############################
    $self->saveas($doc_obj, $doc_dir);
    $self->archiveinf_db($doc_obj, $doc_dir);
}
453
# Register the document's 'short_doc_file' with output_info under its OID,
# together with a sort string.  The sort string is empty when no 'sortmeta'
# is configured; otherwise it is the concatenation, over the comma-separated
# sortmeta names, of each top-section value after prefix/suffix removal and
# sorttools formatting.
sub store_output_info_reference {
    my ($self, $doc_obj) = @_;

    my $output_info = $self->{'output_info'};
    my $metaname = $self->{'sortmeta'};

    unless (defined $metaname && $metaname =~ /\S/) {
        $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
        return;
    }

    my $top_section = $doc_obj->get_top_section();
    my $metadata = "";

    foreach my $cmn (split(/,/, $metaname)) {
        my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
        next unless ($meta);

        # do remove prefix/suffix - this will apply to all values
        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
        $metadata .= $meta if ($meta);
    }

    # store reference in the output_info
    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
}
484
# Append one document to the current group archive file (doc.xml), starting
# a new file every 'group_size' documents.
#
# A new file is opened when gs_count is a multiple of group_size; before
# that the previous file is finished with close_file_output.  Associated
# files and metafile metadata are handled whenever a new file is opened or
# the document has associated files.  Returns early, without writing the
# document, if the previous file cannot be closed or the new one cannot be
# opened.
sub group_process {

    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    my $groupsize = $self->{'group_size'};
    my $gs_count = $self->{'gs_count'};
    my $open_new_file = (($gs_count % $groupsize)==0);
    my $outhandle = $self->{'output_handle'};

    # opening a new file, or document has associated files => directory needed
    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {

        # The directory the archive file (doc.xml) and all associated files
        # should end up in
        my $doc_dir;
        # If we've determined its time for a new file, open it now
        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
        {
            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
            # only if opening new file
            my $output_dir = $self->get_output_dir();
            &util::mk_all_dir ($output_dir) unless -e $output_dir;
            my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
            my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");

            if ($gs_count>0)
            {
                return if (!$self->close_file_output());
            }

            # Three-argument open: the file name is never read as part of the
            # mode.  The bareword handle is kept because close_file_output
            # refers to GROUPPROCESS by name.
            unless (open (GROUPPROCESS, '>', $doc_file)) {
                print $outhandle "BasePlugout::group_process could not write to file $doc_file\n";
                return;
            }

            $self->{'gs_filename'} = $doc_file;
            $self->{'short_doc_file'} = $short_doc_file;
            $self->{'gs_OID'} = $OID;
            $self->{'gs_doc_dir'} = $doc_dir;

            $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
        }
        # Otherwise load the same archive document directory used last time
        else
        {
            $doc_dir = $self->{'gs_doc_dir'};
        }

        # copy all the associated files, add this information as metadata
        # to the document
        print $outhandle "Writing associated files to $doc_dir\n";
        $self->process_assoc_files ($doc_obj, $doc_dir);

        # look up 'gsdlmetafile' metadata and store that information
        # explicitly in $doc_obj
        $self->process_metafiles_metadata ($doc_obj);
    }

    # save this document
    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    print GROUPPROCESS $section_text;

    $self->{'gs_count'}++;
}
551
552
# Abstract hook: subclasses write the document here.  The base version
# always dies, naming this package so the missing override is easy to trace.
sub saveas {
    my $self = shift (@_);

    die "BasePlugout::saveas function must be implemented in sub classes\n";
}
558
# Work out the archive directory (relative to the output dir) for a document.
#   1. reuse the directory already recorded in output_info for $OID, with
#      the trailing archive file name stripped;
#   2. else, with 'keep_import_structure' set, use the directory of
#      $source_filename with the first length(GSDLIMPORTDIR)+1 characters cut;
#   3. else (or if that is empty) ask get_new_doc_dir for a fresh one.
# The directory is created unless 'group' is set.  Returns nothing when
# 'output_info' is undefined.
sub get_doc_dir {
    my ($self, $OID, $source_filename) = @_;

    my $working_dir = $self->get_output_dir();
    my $working_info = $self->{'output_info'};
    return unless (defined $working_info);

    my $doc_dir = '';
    my $doc_info = $working_info->get_info($OID);

    if (defined $doc_info && scalar(@$doc_info) >= 1) {
        # This OID already has an archives directory, so use it again
        ($doc_dir = $doc_info->[0]) =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
    }
    elsif ($self->{'keep_import_structure'}) {
        my $import_subdir = &File::Basename::dirname($source_filename);
        $import_subdir =~ s/[\\\/]+/\//g;
        $import_subdir =~ s/\/$//;

        $doc_dir = substr($import_subdir, length($ENV{'GSDLIMPORTDIR'}) + 1);
    }

    # We have to use a new archives directory for this document
    if ($doc_dir eq "") {
        $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
    }

    unless ($self->{'group'}) {
        &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
    }

    return $doc_dir;
}
597
# Build a new "<path>.dir" directory name from $OID.
# The OID (slashes removed, and colons too when GSDLOS is windows) is cut
# into pieces of up to 8 characters joined with "/".  Pieces keep being
# added while text remains and either the name so far already exists under
# $working_dir, or $working_info holds at least 1024 entries and fewer than
# two pieces have been used.  If the result still exists, "-1", "-2", ...
# is appended until it does not.
sub get_new_doc_dir{
    my ($self, $working_info, $working_dir, $OID) = @_;

    my $oid_remainder = $OID;

    # remove any \ and / from the OID
    $oid_remainder =~ s/[\\\/]//g;

    # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
    $oid_remainder =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $doc_dir = "";
    my $piece_count = 0;

    do {
        $doc_dir .= "/" if $piece_count > 0;
        if ($oid_remainder =~ s/^(.{1,8})//) {
            $doc_dir .= $1;
            $piece_count++;
        }
    } while ($oid_remainder ne "" &&
             ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
              ($working_info->size() >= 1024 && $piece_count < 2)));

    # Disambiguate with a numeric suffix if the directory is already taken
    my $doc_dir_base = $doc_dir;
    for (my $suffix = 1; -d &util::filename_cat ($working_dir, "$doc_dir.dir"); $suffix++) {
        $doc_dir = "$doc_dir_base-$suffix";
    }

    return "$doc_dir.dir";
}
632
# Hard-link every associated file of $doc_obj into <output_dir>/$doc_dir and
# record it on the document's top section.
#   - "assocfilepath" is always set to $doc_dir
#   - each file that exists adds a "gsdlassocfile" value "name:mimetype:dir"
#   - missing files are reported on output_handle when verbosity > 1
# The third parameter ($handle) is accepted for compatibility; it is unused.
# Returns early if no output directory is set.
sub process_assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir, $handle) = @_;

    my $outhandle = $self->{'output_handle'};

    my $output_dir = $self->get_output_dir();
    return if (!defined $output_dir);

    &util::mk_all_dir ($output_dir) unless -e $output_dir;

    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
    &util::mk_all_dir ($working_dir) unless -e $working_dir;

    my $top_section = $doc_obj->get_top_section();

    # set the assocfile path (even if we have no assoc files - need this for lucene)
    $doc_obj->set_utf8_metadata_element ($top_section, "assocfilepath", "$doc_dir");

    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
        # Split the associated name into leading directory part and file name
        my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
        $dir = "" unless defined $dir;

        my $utf8_real_filename = $assoc_file_rec->[0];

        # Strip a single leading backslash (noted in the original as appearing
        # before the full path of image associated files)
        $utf8_real_filename =~ s/^\\(.*)/$1/i;

        my $real_filename = &util::downgrade_if_dos_filename($utf8_real_filename);

        if (-e $real_filename) {
            my $filename = &util::filename_cat($working_dir, $afile);

            &util::hard_link ($real_filename, $filename, $self->{'verbosity'});

            $doc_obj->add_utf8_metadata ($top_section,
                                         "gsdlassocfile",
                                         "$afile:$assoc_file_rec->[2]:$dir");
        } elsif ($self->{'verbosity'} > 1) {
            print $outhandle "BasePlugout::process couldn't copy the associated file " .
                "$real_filename to $afile\n";
        }
    }
}
701
[17087]702
# Turn each "gsdlmetafile" value on the top section (two names separated by
# " : ") into a metadata_file() call on the document, then delete the
# "gsdlmetafile" metadata.
sub process_metafiles_metadata
{
    my ($self, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    foreach my $metafile_pair (@{ $doc_obj->get_metadata($top_section,"gsdlmetafile") }) {
        my ($full_metafile, $metafile) = split(/ : /, $metafile_pair);
        $doc_obj->metadata_file($full_metafile, $metafile);
    }

    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
}
719
# Append the on-disk name of each entry in $files to $oid_files->{$field}.
#   $files           - array ref; entries are file names or array refs
#                      whose first element is the file name
#   $collect_dir     - accepted but not used here
#   $reverse_lookups - optional hash ref; each existing file's name (before
#                      DOS downgrading) is added as a key
# Entries that do not exist on disk are skipped with a warning on STDERR.
sub archiveinf_files_to_field
{
    my ($self, $files, $field, $collect_dir, $oid_files, $reverse_lookups) = @_;

    foreach my $file_rec (@$files) {
        my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;

        # Strip a single leading backslash (noted in the original as appearing
        # before the full path of image associated files)
        $real_filename =~ s/^\\(.*)/$1/i;

        my $raw_filename = &util::downgrade_if_dos_filename($real_filename);

        unless (-e $raw_filename) {
            print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
            next;
        }

        $reverse_lookups->{$real_filename} = 1 if (defined $reverse_lookups);
        push(@{$oid_files->{$field}}, $raw_filename);
    }
}
754
# Record a document in the archiveinf-doc database and update the reverse
# (source file -> OID) information held by output_info.
#
# The entry written under the OID holds doc-file, index-status, src-file,
# sort-meta, assoc-file and meta-file.  When index-status is "I" or "R" an
# <item> is also appended to rss-items.rdf in the output directory.
sub archiveinf_db
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
    if (defined $collect_dir) {
        my $dirsep_regexp = &util::get_os_dirsep();

        if ($collect_dir !~ /$dirsep_regexp$/) {
            # ensure there is a slash at the end
            $collect_dir .= &util::get_dirsep();
        }
    }

    my $oid = $doc_obj->get_OID();
    my $source_filename = $doc_obj->get_unmodified_source_filename();
    my $working_info = $self->{'output_info'};
    my $doc_info = $working_info->get_info($oid);

    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
    # doc_file is the path to the archive doc.xml. Make sure it has unix
    # slashes, then if the collection is copied to linux, it can be built without reimport
    $doc_file =~ s/\\/\//g;
    my $oid_files = { 'doc-file' => $doc_file,
                      'index-status' => $index_status,
                      'src-file' => $source_filename,
                      'sort-meta' => $sortmeta,
                      'assoc-file' => [],
                      'meta-file' => [] };

    my $reverse_lookups = { $source_filename => "1" };

    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
                                     $collect_dir,$oid_files,$reverse_lookups);

    # meta files get no reverse-lookup entry (no $reverse_lookups argument)
    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
                                     $collect_dir,$oid_files);

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
    my $output_dir = $self->{'output_dir'};

    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
    # NOTE(review): $src_db is not used below; the lookup is kept as it was.
    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);

    if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
        my $top_section = $doc_obj->get_top_section();

        my $dc_titles = $doc_obj->get_metadata($top_section,"dls.Title");
        my $dc_title = join("; ", @$dc_titles);

        if ($oid_files->{'index-status'} eq "R") {
            $dc_title .= " (Updated)";
        }

        my $rss_filename = &util::filename_cat($output_dir,"rss-items.rdf");
        # Three-argument append: the file name cannot alter the open mode
        if (open(RSSOUT, '>>', $rss_filename)) {
            print RSSOUT "<item>\n";
            print RSSOUT " <title>$dc_title</title>\n";
            print RSSOUT " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
            print RSSOUT "</item>\n";
            close(RSSOUT);
        }
        else {
            # Report the OS error; the original had the two characters of $!
            # transposed, so the reason for the failure was never printed.
            print STDERR "**** Failed to open $rss_filename\n$!\n";
        }
    }

    # The infodb entry takes every field as an array ref
    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];

    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);

    foreach my $rl (keys %$reverse_lookups) {
        $working_info->add_reverseinfo($rl,$oid);
    }

    # meta files not set in reverse entry, but need to set the metadata flag
    if (defined $doc_obj->get_meta_files()) {
        foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
            my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
            $working_info->set_meta_file_flag($full_file);
        }
    }
}
853
854
# Store the sort metadata name(s) and, when given and true, the patterns to
# strip from the start and end of each value.  A leading "^" on the prefix
# and a trailing "$" on the suffix are dropped before storing.
sub set_sortmeta {
    my ($self, $sortmeta, $removeprefix, $removesuffix) = @_;

    $self->{'sortmeta'} = $sortmeta;

    if (defined ($removeprefix) && $removeprefix ) {
        # don't need a leading ^
        ($self->{'removeprefix'} = $removeprefix) =~ s/^\^//;
    }
    if (defined ($removesuffix) && $removesuffix) {
        # don't need a trailing $
        ($self->{'removesuffix'} = $removesuffix) =~ s/\$$//;
    }
}
869
# Start a "java ... org.nzdl.gsdl.ApplyXSLT" process fed through a pipe,
# store the pipe in 'xslt_writer', and write the <?DocStart?> marker
# followed by $output_file_name to it.
# Does nothing unless $xslt_file is a non-empty name of an existing file.
# Dies if the pipe cannot be opened.
sub open_xslt_pipe
{
    my ($self, $output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;

    my $apply_xslt_jar = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");

    # Class path entries are separated by ";" on windows and ":" elsewhere.
    # (A file:/// prefix on the xslt file was tried on Windows XP and did
    # not work, per the original author's note.)
    my $classpath_sep = ($ENV{'GSDLOS'} eq "windows") ? ";" : ":";
    my $xalan_jar = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    my $java_class_path = "\"" . $apply_xslt_jar . $classpath_sep . $xalan_jar . "\"";

    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
        $cmd .= "-m $mapping_file_path";
    }

    open(*XMLWRITER, $cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = *XMLWRITER;

    print XMLWRITER "<?DocStart?>\n";
    print XMLWRITER "$output_file_name\n";
}
912
913
# Finish the XSLT pipe, if one is open: write the <?DocEnd?> marker, close
# the handle and clear 'xslt_writer'.
sub close_xslt_pipe
{
    my ($self) = @_;

    my $xsltwriter = $self->{'xslt_writer'};
    return unless defined $xsltwriter;

    print $xsltwriter "<?DocEnd?>\n";
    close($xsltwriter);

    undef $self->{'xslt_writer'};
}
929
# Finish the current group archive file: write the </Archive> footer and
# close GROUPPROCESS if it is open, optionally gzip the file, and register
# it with output_info under the OID saved when it was opened.
# Returns 1 on success; 0 if gzipping left no .gz file or output_info is
# not defined.
sub close_file_output
{
    my ($self) = @_;

    # make sure that the handle has been opened - it won't be if we failed
    # to import any documents...
    if (defined(fileno(GROUPPROCESS))) {
        $self->output_xml_footer('GROUPPROCESS','Archive');
        close GROUPPROCESS;
    }

    my $OID = $self->{'gs_OID'};
    my $short_doc_file = $self->{'short_doc_file'};

    # NOTE(review): this reads 'gzip', while the argument declared in this
    # file is "gzip_output" - confirm where 'gzip' gets set.
    if ($self->{'gzip'}) {
        my $doc_file = $self->{'gs_filename'};
        # List form: the path goes to gzip as one argument, with no shell,
        # so spaces or shell metacharacters in it are harmless.
        system("gzip", $doc_file);
        $doc_file .= ".gz";
        $short_doc_file .= ".gz";
        if (!-e $doc_file) {
            my $outhandle = $self->{'output_handle'};
            print $outhandle "error while gzipping: $doc_file doesn't exist\n";
            return 0;
        }
    }

    # store reference in output_info
    my $output_info = $self->{'output_info'};
    return 0 if (!defined $output_info);
    $output_info->add_info($OID, $short_doc_file, undef, undef);
    return 1;
}
962
[13172]963
# Hook for plugouts whose is_group method can return 1: such subclasses
# should override this.  The base version does no work and simply evaluates
# to its invocant.
sub close_group_output{
    my ($self) = @_;

    return $self;
}
968
# The base plugout always answers 0 here.
sub is_group {
    my ($self) = @_;

    return 0;
}
973
# Dublin Core element names, used as a lookup set (each name maps to 1).
my $dc_set = {
    map { $_ => 1 } qw(
        Title       Creator  Subject
        Description Publisher Contributor
        Date        Type     Format
        Identifier  Source   Language
        Relation    Coverage Rights
    )
};
989
990
# Returns an XML fragment holding the Dublin Core metadata of one section
# of a document.
#
# Arguments (after the invocant):
#   $doc_obj - document object; only its _lookup_section() method is used
#   $section - section identifier, treated as "" when undefined
#   $version - optional; when it equals "oai_dc" each value is written as
#              <dc:NAME>VALUE</dc:NAME>, otherwise as
#              <dcvalue element="NAME">VALUE</dcvalue>
#
# Returns "" when the section cannot be found. Otherwise returns the
# generated text, or a one-line "no Dublin Core" message if nothing was
# generated.
#
# For each element the sources are tried in this order:
#   1. dc.*     - every value is written, in document order
#   2. ex.dc.*  - used only when no dc.* value exists for that element
#   3. ex.* or unprefixed names - used only when neither of the above exist
# Sources 2 and 3 are only accepted for names that appear in $dc_set with
# exactly the capitalisation used there.
#
# This method is not used by the DSpacePlugout, which has its
# own method to save its dc metadata
sub get_dc_metadata {
    my $self = shift(@_);
    my ($doc_obj, $section, $version) = @_;

    $section = "" unless defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    # The output flavour is fixed for the whole call, so decide it once.
    my $use_oai_dc = (defined $version && ($version eq "oai_dc")) ? 1 : 0;

    # Formats a single (element name, already escaped value) pair.
    my $format_element = sub {
        my ($dc_element, $escaped_value) = @_;

        if ($use_oai_dc) {
            return " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
        }
        # qualifier???
        return ' <dcvalue element="' . $dc_element . '">' . $escaped_value . "</dcvalue>\n";
    };

    my %has_explicit_dc = ();    # lower-cased element names that had a dc.* value
    my $explicit_ex_dc  = {};    # lower-cased element name => values from ex.dc.*
    my $explicit_ex     = {};    # lower-cased element name => values from ex.* / unprefixed

    my $all_text = "";

    # We want high quality dc metadata to go in first, so the assigned dc.*
    # values are written straight away, while ex.dc.* and ex.* values are
    # only collected here and considered afterwards.
    foreach my $data (@{$section_ptr->{'metadata'}}) {
        my $escaped_value = &docprint::escape_text($data->[1]);

        if ($data->[0] =~ m/^dc\./) {
            # NOTE(review): this lower-cases the name stored in the document
            # itself, not a copy. Kept as-is because later processing of the
            # same document may rely on it - confirm before changing.
            $data->[0] =~ tr/A-Z/a-z/;

            my ($dc_element) = ($data->[0] =~ m/^dc\.(.*)/);
            $has_explicit_dc{$dc_element} = 1;

            $all_text .= $format_element->($dc_element, $escaped_value);
        }
        elsif ($data->[0] =~ m/^ex\.dc\.(.*)/) {
            my $ex_dc_element = $1;

            # only names spelled as in $dc_set are accepted
            if (defined $dc_set->{$ex_dc_element}) {
                push(@{$explicit_ex_dc->{lc($ex_dc_element)}}, $escaped_value);
            }
        }
        elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
            # ex.* metadata, including metadata without any prefix
            $data->[0] =~ m/^(ex\.)?(.*)/;
            my $ex_element = $2;

            if (defined $dc_set->{$ex_element}) {
                push(@{$explicit_ex->{lc($ex_element)}}, $escaped_value);
            }
        }
    }

    # For every element of the dc set that received no dc.* value, fall back
    # to ex.dc.* and, failing that, to ex.*. The keys are sorted so that the
    # generated text comes out in the same order on every run; an unsorted
    # hash traversal may differ from one run of perl to the next.
    foreach my $k (sort keys %$dc_set) {
        my $lc_k = lc($k);
        next if $has_explicit_dc{$lc_k};

        my $fallback_values = defined $explicit_ex_dc->{$lc_k}
            ? $explicit_ex_dc->{$lc_k}
            : $explicit_ex->{$lc_k};
        next unless defined $fallback_values;

        foreach my $v (@$fallback_values) {
            $all_text .= $format_element->($lc_k, $v);
        }
    }

    if ($all_text eq "") {
        $all_text .= " There is no Dublin Core metatdata in this document\n";
    }

    # Remove control characters in the range 0x00-0x1F, keeping only line
    # feed (0x0A) and carriage return (0x0D). Note that TAB (0x09) is
    # removed as well.
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}
1110
# Writes every metadata entry of a section as a
#   <Metadata Type="PREFIX" Name="NAME">VALUE</Metadata>
# line and returns the concatenated text.
# This method was apparently added by Jeffrey and committed by Shaoqun.
# But we don't know why it was added, so not using it anymore.
#
# Arguments (after the invocant):
#   $doc_obj - document object; only its _lookup_section() method is used
#   $section - section identifier, treated as "" when undefined
#   $version - accepted for symmetry with get_dc_metadata() but not used
#
# Returns "" when the section cannot be found or has no metadata.
#
# The metadata name is divided at its FIRST dot only: "dc.Title" gives
# Type "dc" and Name "Title", and "ex.dc.Title" gives Type "ex" and Name
# "dc.Title". A name with no dot, or with nothing after its only dot, is
# filed under Type "ex".
sub new_get_dc_metadata {
    my $self = shift(@_);
    my ($doc_obj, $section, $version) = @_;

    $section = "" unless defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my $all_text = "";
    foreach my $data (@{$section_ptr->{'metadata'}}) {
        my $escaped_value = &docprint::escape_text($data->[1]);

        # A limit of 2 keeps everything after the first dot together, so
        # that names containing further dots are not cut short.
        my ($type, $name) = split(/\./, $data->[0], 2);

        if (!defined $name || $name eq "") {
            # no usable prefix: the whole name belongs to the "ex" set
            $name = $type;
            $type = "ex";
        }

        $all_text .= ' <Metadata Type="' . $type . '" Name="' . $name . '">' . $escaped_value . "</Metadata>\n";
    }

    return $all_text;
}
1148
1149
# A Perl module file has to finish by evaluating to a true value,
# otherwise loading it with require/use fails.
[12330]11501;
Note: See TracBrowser for help on using the repository browser.