source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27513

Last change on this file since 27513 was 27513, checked in by jmt12, 11 years ago

Restoring the original logic around working_info (although still not sure what this does), removing limit of 256 on incremental directory generation (as Kathy points out some MARC collections have thousands of documents all with the same HASH), and had a good go at commenting the function especially the new stuff I've added

  • Property svn:keywords set to Author Date Id Revision
File size: 35.0 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "no",
96 'hiddengli' => "no"}
97];
98
99my $options = { 'name' => "BasePlugout",
100 'desc' => "{BasPlugout.desc}",
101 'abstract' => "yes",
102 'inherits' => "no",
103 'args' => $arguments};
104
105sub new
106{
107 my $class = shift (@_);
108
109 my ($plugoutlist,$args,$hashArgOptLists) = @_;
110 push(@$plugoutlist, $class);
111
112 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
113
114 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115 push(@{$hashArgOptLists->{"OptList"}},$options);
116
117 my $self = {};
118 $self->{'plugout_type'} = $class;
119 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120 $self->{"info_only"} = 0;
121
122 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123 # the args, just return the object.
124 foreach my $strArg (@{$args})
125 {
126 if(defined $strArg && $strArg eq "-gsdlinfo")
127 {
128 $self->{"info_only"} = 1;
129 return bless $self, $class;
130 }
131 }
132
133 delete $self->{"info_only"};
134
135 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
136 {
137 my $classTempClass = bless $self, $class;
138 print STDERR "<BadPlugout d=$plugout_name>\n";
139 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
140 $classTempClass->print_txt_usage(""); # Use default resource bundle
141 die "\n";
142 }
143
144
145 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
146 {
147 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148 if (!defined $full_file_path) {
149 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150 die "\n";
151 }
152 $self->{'xslt_file'} = $full_file_path;
153 }
154
155 $self->{'gs_count'} = 0;
156
157 $self->{'keep_import_structure'} = 0;
158
159 return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165 my $self = shift(@_);
166 my $header = shift(@_);
167 my $high_level_information_only = shift(@_);
168
169 # XML output is always in UTF-8
170 gsprintf::output_strings_in_UTF8;
171
172 if ($header) {
173 &PrintUsage::print_xml_header("plugout");
174 }
175 $self->print_xml($high_level_information_only);
176}
177
178
179sub print_xml
180{
181 my $self = shift(@_);
182 my $high_level_information_only = shift(@_);
183
184 my $optionlistref = $self->{'option_list'};
185 my @optionlist = @$optionlistref;
186 my $plugoutoptions = shift(@$optionlistref);
187 return if (!defined($plugoutoptions));
188
189 gsprintf(STDERR, "<PlugoutInfo>\n");
190 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
191 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192 $desc =~ s/</&amp;lt;/g; # doubly escaped
193 $desc =~ s/>/&amp;gt;/g;
194 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
195 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
197 unless (defined($high_level_information_only)) {
198 gsprintf(STDERR, " <Arguments>\n");
199 if (defined($plugoutoptions->{'args'})) {
200 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201 }
202 gsprintf(STDERR, " </Arguments>\n");
203
204 # Recurse up the plugout hierarchy
205 $self->print_xml();
206 }
207 gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213 my $self = shift(@_);
214
215 # Print the usage message for a plugout (recursively)
216 my $descoffset = $self->determine_description_offset(0);
217 $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222 my $self = shift(@_);
223 my $maxoffset = shift(@_);
224
225 my $optionlistref = $self->{'option_list'};
226 my @optionlist = @$optionlistref;
227 my $plugoutoptions = pop(@$optionlistref);
228 return $maxoffset if (!defined($plugoutoptions));
229
230 # Find the length of the longest option string of this download
231 my $plugoutargs = $plugoutoptions->{'args'};
232 if (defined($plugoutargs)) {
233 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234 if ($longest > $maxoffset) {
235 $maxoffset = $longest;
236 }
237 }
238
239 # Recurse up the download hierarchy
240 $maxoffset = $self->determine_description_offset($maxoffset);
241 $self->{'option_list'} = \@optionlist;
242 return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248 my $self = shift(@_);
249 my $descoffset = shift(@_);
250 my $isleafclass = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $plugoutoptions = shift(@$optionlistref);
255 return if (!defined($plugoutoptions));
256
257 my $plugoutname = $plugoutoptions->{'name'};
258 my $plugoutargs = $plugoutoptions->{'args'};
259 my $plugoutdesc = $plugoutoptions->{'desc'};
260
261 # Produce the usage information using the data structure above
262 if ($isleafclass) {
263 if (defined($plugoutdesc)) {
264 gsprintf(STDERR, "$plugoutdesc\n\n");
265 }
266 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267 }
268
269 # Display the download options, if there are some
270 if (defined($plugoutargs)) {
271 # Calculate the column offset of the option descriptions
272 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
273
274 if ($isleafclass) {
275 gsprintf(STDERR, " {common.specific_options}:\n");
276 }
277 else {
278 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279 }
280
281 # Display the download options
282 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283 }
284
285 # Recurse up the download hierarchy
286 $self->print_plugout_usage($descoffset, 0);
287 $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293 my ($strFunctionName,$strError) = @_;
294 {
295 print "Error occoured in BasePlugout.pm\n".
296 "In Function: ".$strFunctionName."\n".
297 "Error Message: ".$strError."\n";
298 exit(-1);
299 }
300}
301
302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
303sub set_OIDtype {
304 my $self = shift (@_);
305 my ($type, $metadata) = @_;
306
307 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
308 $self->{'OIDtype'} = $type;
309 } else {
310 $self->{'OIDtype'} = "hash";
311 }
312 if ($type =~ /^assigned$/) {
313 if (defined $metadata) {
314 $self->{'OIDmetadata'} = $metadata;
315 } else {
316 $self->{'OIDmetadata'} = "dc.Identifier";
317 }
318 }
319}
320
321sub set_output_dir
322{
323 my $self = shift @_;
324 my ($output_dir) = @_;
325
326 $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331 my $self = shift @_;
332 my ($output_dir) = @_;
333
334 $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339 my $self = shift (@_);
340
341 return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346 my $self = shift (@_);
347
348 return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353 my $self = shift (@_);
354
355 return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361 my $self = shift (@_);
362
363 my ($output_file_name) = @_;
364
365 my $fh;
366 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
367
368 return $fh;
369}
370
371sub release_output_handler
372{
373 my $self = shift (@_);
374 my ($outhandler) = @_;
375
376 close($outhandler);
377
378}
379
380sub output_xml_header {
381 my $self = shift (@_);
382 my ($handle,$docroot,$nondoctype) = @_;
383
384
385 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
386
387 #For Dspace must be UTF in lower case
388 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
389
390 if (!defined $nondoctype){
391 my $doctype = (defined $docroot) ? $docroot : "Section";
392
393 # Used to be '<!DOCTYPE Archive SYSTEM ...'
394
395 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
396 }
397
398 print $handle "<$docroot>\n" if defined $docroot;
399}
400
401sub output_xml_footer {
402 my $self = shift (@_);
403 my ($handle,$docroot) = @_;
404 print $handle "</$docroot>\n" if defined $docroot;
405}
406
407
408sub output_general_xml_header
409{
410 my $self = shift (@_);
411 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
412
413 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
414
415 if (defined $opt_dtd) {
416 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
417 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
418 }
419
420 if (defined $docroot) {
421 my $full_docroot = $docroot;
422 if (defined $opt_attributes) {
423 $full_docroot .= " $opt_attributes";
424 }
425
426 print $handle "<$full_docroot>\n"
427 }
428}
429
430sub output_general_xml_footer
431{
432 output_xml_footer(@_);
433}
434
435
436sub process {
437 my $self = shift (@_);
438 my ($doc_obj) = @_;
439
440 # for OAI purposes
441 $doc_obj->set_lastmodified();
442 $doc_obj->set_oailastmodified();
443
444 if ($self->{'group_size'} > 1) {
445 $self->group_process ($doc_obj);
446 return;
447 }
448
449 my $OID = $doc_obj->get_OID();
450 $OID = "NULL" unless defined $OID;
451
452 my $top_section = $doc_obj->get_top_section();
453
454 #get document's directory
455 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456
457 my $output_info = $self->{'output_info'};
458 return if (!defined $output_info);
459
460 ##############################
461 # call subclass' saveas method
462 ##############################
463 $self->saveas($doc_obj,$doc_dir);
464 $self->archiveinf_db($doc_obj,$doc_dir);
465
466}
467
468sub store_output_info_reference {
469 my $self = shift (@_);
470 my ($doc_obj) = @_;
471
472 my $output_info = $self->{'output_info'};
473 my $metaname = $self->{'sortmeta'};
474 if (!defined $metaname || $metaname !~ /\S/) {
475 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
476 return;
477 }
478
479 my $metadata = "";
480 my $top_section = $doc_obj->get_top_section();
481
482 my @commameta_list = split(/,/, $metaname);
483 foreach my $cmn (@commameta_list) {
484 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
485 if ($meta) {
486 # do remove prefix/suffix - this will apply to all values
487 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
488 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
489 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
490 $metadata .= $meta if ($meta);
491 }
492 }
493
494 # store reference in the output_info
495 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
496
497}
498
499sub group_process {
500
501 my $self = shift (@_);
502 my ($doc_obj) = @_;
503
504 my $OID = $doc_obj->get_OID();
505 $OID = "NULL" unless defined $OID;
506
507 my $groupsize = $self->{'group_size'};
508 my $gs_count = $self->{'gs_count'};
509 my $open_new_file = (($gs_count % $groupsize)==0);
510 my $outhandle = $self->{'output_handle'};
511
512 # opening a new file, or document has assoicated files => directory needed
513 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
514
515 # The directory the archive file (doc.xml) and all associated files
516 # should end up in
517 my $doc_dir;
518 # If we've determined its time for a new file, open it now
519 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
520 {
521 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
522 # only if opening new file
523 my $output_dir = $self->get_output_dir();
524 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
525 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
526 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
527
528 if ($gs_count>0)
529 {
530 return if (!$self->close_file_output());
531 }
532
533 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
534
535
536 $self->{'gs_filename'} = $doc_file;
537 $self->{'short_doc_file'} = $short_doc_file;
538 $self->{'gs_OID'} = $OID;
539 $self->{'gs_doc_dir'} = $doc_dir;
540
541 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
542 }
543 # Otherwise load the same archive document directory used last time
544 else
545 {
546 $doc_dir = $self->{'gs_doc_dir'};
547 }
548
549 # copy all the associated files, add this information as metadata
550 # to the document
551 print $outhandle "Writing associated files to $doc_dir\n";
552 $self->process_assoc_files ($doc_obj, $doc_dir);
553
554 # look up 'gsdlmetafile' metadata and store that information
555 # explicitly in $doc_obj
556 $self->process_metafiles_metadata ($doc_obj);
557 }
558
559 # save this document
560 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
561 print GROUPPROCESS $section_text;
562
563 $self->{'gs_count'}++;
564}
565
566
567sub saveas {
568 my $self = shift (@_);
569
570 die "Basplug::saveas function must be implemented in sub classes\n";
571}
572
573sub get_doc_dir {
574 my $self = shift (@_);
575 my ($OID, $source_filename) = @_;
576
577 my $working_dir = $self->get_output_dir();
578 my $working_info = $self->{'output_info'};
579 return if (!defined $working_info);
580
581 my $doc_info = $working_info->get_info($OID);
582 my $doc_dir = '';
583
584 if (defined $doc_info && scalar(@$doc_info) >= 1)
585 {
586 # This OID already has an archives directory, so use it again
587 $doc_dir = $doc_info->[0];
588 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
589 }
590 elsif ($self->{'keep_import_structure'})
591 {
592 $source_filename = &File::Basename::dirname($source_filename);
593 $source_filename =~ s/[\\\/]+/\//g;
594 $source_filename =~ s/\/$//;
595
596 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
597 }
598
599 # We have to use a new archives directory for this document
600 if ($doc_dir eq "")
601 {
602 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
603 }
604
605 if (!defined $self->{'group'} || !$self->{'group'}){
606 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
607 }
608
609 return $doc_dir;
610}
611
612## @function get_new_doc_dir()
613#
614# Once a doc object is ready to write to disk (and hence has a nice OID),
615# generate a unique subdirectory to write the information to.
616# - create the directory as part of this call, to try and avoid race conditions
617# found in parallel processing [jmt12]
618#
619sub get_new_doc_dir
620{
621 my $self = shift (@_);
622 my($working_info,$working_dir,$OID) = @_;
623
624 my $doc_dir = "";
625 my $doc_dir_rest = $OID;
626
627 # remove any \ and / from the OID
628 $doc_dir_rest =~ s/[\\\/]//g;
629
630 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
631 if ($ENV{'GSDLOS'} =~ /^windows$/i)
632 {
633 $doc_dir_rest =~ s/\://g;
634 }
635
636 # we generally create a unique directory by adding consequtive fragments of
637 # the document identifier (split by some predefined length - defaulting to
638 # 8) until we find a directory that doesn't yet exist. Note that directories
639 # that contain a document have a suffix ".dir" (whereas those that contain
640 # only subdirectories have no suffix).
641 my $doc_dir_num = 0; # how many directories deep we are
642 my $created_directory = 0; # have we successfully created a new directory
643 do
644 {
645 # (does this work on windows? - jmt12)
646 if ($doc_dir_num > 0)
647 {
648 $doc_dir .= '/';
649 }
650 # the default matching pattern grabs the next 'subdir_split_length'
651 # characters of the OID to act as the next subdirectory
652 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
653 # Do we count any "HASH" prefix against the split length limit?
654 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
655 {
656 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
657 }
658 # Note the use of 's' to both capture the next chuck of OID and to remove
659 # it from OID at the same time
660 if ($doc_dir_rest =~ s/$pattern//i)
661 {
662 $doc_dir .= $1;
663 $doc_dir_num++;
664 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
665 }
666 my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
667 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
668 ###rint STDERR " - create directory: $new_dir => $created_directory\n";
669 ###rint STDERR " - rest: $doc_dir_rest\n";
670 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
671 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
672 }
673 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
674
675 # not unique yet? Add on an incremental suffix until we are unique
676 my $i = 1;
677 my $doc_dir_base = $doc_dir;
678 while ($created_directory == 0)
679 {
680 $doc_dir = $doc_dir_base . '-' . $i;
681 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
682 $i++;
683 }
684
685 # in theory this should never happen
686 if (!$created_directory)
687 {
688 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
689 }
690
691 return $doc_dir . '.dir';
692}
693## get_new_doc_dir()
694
695
696sub process_assoc_files {
697 my $self = shift (@_);
698 my ($doc_obj, $doc_dir, $handle) = @_;
699
700 my $outhandle = $self->{'output_handle'};
701
702 my $output_dir = $self->get_output_dir();
703 return if (!defined $output_dir);
704
705 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
706
707 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
708 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
709
710 my @assoc_files = ();
711 my $filename;;
712
713 my $source_filename = $doc_obj->get_source_filename();
714
715 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
716
717 if (defined $collect_dir) {
718 my $dirsep_regexp = &util::get_os_dirsep();
719
720 if ($collect_dir !~ /$dirsep_regexp$/) {
721 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
722 }
723
724 # This test is never going to fail on Windows -- is this a problem?
725
726 if ($source_filename !~ /^$dirsep_regexp/) {
727 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
728 }
729 }
730
731
732 # set the assocfile path (even if we have no assoc files - need this for lucene)
733 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
734 "assocfilepath",
735 "$doc_dir");
736 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
737 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
738 $dir = "" unless defined $dir;
739
740 my $utf8_real_filename = $assoc_file_rec->[0];
741
742 # for some reasons the image associate file has / before the full path
743 $utf8_real_filename =~ s/^\\(.*)/$1/i;
744
745## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
746 my $real_filename = $utf8_real_filename;
747 $real_filename = &util::downgrade_if_dos_filename($real_filename);
748
749 if (&FileUtils::fileExists($real_filename)) {
750
751 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
752
753 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
754
755 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
756 "gsdlassocfile",
757 "$afile:$assoc_file_rec->[2]:$dir");
758 } elsif ($self->{'verbosity'} > 1) {
759 print $outhandle "BasePlugout::process couldn't copy the associated file " .
760 "$real_filename to $afile\n";
761 }
762 }
763}
764
765
766sub process_metafiles_metadata
767{
768 my $self = shift (@_);
769 my ($doc_obj) = @_;
770
771 my $top_section = $doc_obj->get_top_section();
772 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
773
774 foreach my $metafile_pair (@$metafiles) {
775 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
776
777 $doc_obj->metadata_file($full_metafile,$metafile);
778 }
779
780 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
781}
782
783sub archiveinf_files_to_field
784{
785 my $self = shift(@_);
786 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
787
788 foreach my $file_rec (@$files) {
789 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
790 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
791 # for some reasons the image associate file has / before the full path
792 $real_filename =~ s/^\\(.*)/$1/i;
793
794 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
795
796 if (&FileUtils::fileExists($raw_filename)) {
797
798# if (defined $collect_dir) {
799# my $collect_dir_re_safe = $collect_dir;
800# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
801# $collect_dir_re_safe =~ s/\./\\./g;##
802
803# $real_filename =~ s/^$collect_dir_re_safe//;
804# }
805
806 if (defined $reverse_lookups) {
807 $reverse_lookups->{$real_filename} = 1;
808 }
809### push(@{$oid_files->{$field}},$full_file);
810 push(@{$oid_files->{$field}},$raw_filename);
811 }
812 else {
813 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
814 }
815 }
816}
817
818sub archiveinf_db
819{
820 my $self = shift (@_);
821 my ($doc_obj) = @_;
822
823 my $verbosity = $self->{'verbosity'};
824
825 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
826 if (defined $collect_dir) {
827 my $dirsep_regexp = &util::get_os_dirsep();
828
829 if ($collect_dir !~ /$dirsep_regexp$/) {
830 # ensure there is a slash at the end
831 $collect_dir .= &util::get_dirsep();
832 }
833 }
834
835 my $oid = $doc_obj->get_OID();
836 my $source_filename = $doc_obj->get_unmodified_source_filename();
837 my $working_info = $self->{'output_info'};
838 my $doc_info = $working_info->get_info($oid);
839
840 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
841 # doc_file is the path to the archive doc.xml. Make sure it has unix
842 # slashes, then if the collection is copied to linux, it can be built without reimport
843 $doc_file =~ s/\\/\//g;
844 my $oid_files = { 'doc-file' => $doc_file,
845 'index-status' => $index_status,
846 'src-file' => $source_filename,
847 'sort-meta' => $sortmeta,
848 'assoc-file' => [],
849 'meta-file' => [] };
850
851 my $reverse_lookups = { $source_filename => "1" };
852
853
854 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
855 $collect_dir,$oid_files,$reverse_lookups);
856
857
858 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
859 $collect_dir,$oid_files);
860
861 # Get the infodbtype value for this collection from the arcinfo object
862 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
863 my $output_dir = $self->{'output_dir'};
864
865 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
866 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
867
868 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
869
870 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
871 my $top_section = $doc_obj->get_top_section();
872
873 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
874 my $dc_title = join("; ", @$dc_titles);
875
876 if ($oid_files->{'index-status'} eq "R") {
877 $dc_title .= " (Updated)";
878 }
879
880 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
881 my $rss_fh;
882 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh)) {
883 print $rss_fh "<item>\n";
884 print $rss_fh " <title>$dc_title</title>\n";
885 print $rss_fh " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
886 print $rss_fh "</item>\n";
887 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
888 }
889 else {
890 print STDERR "**** Failed to open $rss_filename\n!$\n";
891 }
892
893
894 }
895
896 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
897 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
898 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
899 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
900
901 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
902 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
903 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
904
905 foreach my $rl (keys %$reverse_lookups) {
906 $working_info->add_reverseinfo($rl,$oid);
907 }
908
909 # meta files not set in reverese entry, but need to set the metadata flag
910 if (defined $doc_obj->get_meta_files()) {
911 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
912 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
913 $working_info->set_meta_file_flag($full_file);
914 }
915 }
916}
917
918
919sub set_sortmeta {
920 my $self = shift (@_);
921 my ($sortmeta, $removeprefix, $removesuffix) = @_;
922
923 $self->{'sortmeta'} = $sortmeta;
924 if (defined ($removeprefix) && $removeprefix ) {
925 $removeprefix =~ s/^\^//; # don't need a leading ^
926 $self->{'removeprefix'} = $removeprefix;
927 }
928 if (defined ($removesuffix) && $removesuffix) {
929 $removesuffix =~ s/\$$//; # don't need a trailing $
930 $self->{'removesuffix'} = $removesuffix;
931 }
932}
933
934sub open_xslt_pipe
935{
936 my $self = shift @_;
937 my ($output_file_name, $xslt_file)=@_;
938
939 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
940
941 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
942
943 my $mapping_file_path = "";
944
945 if ($ENV{'GSDLOS'} eq "windows"){
946 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
947 # this file:/// bit didn't work for me on windows XP
948 #$xslt_file = "\"file:///".$xslt_file."\"";
949 #$mapping_file_path = "\"file:///";
950 }
951 else{
952 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
953 }
954
955
956 $java_class_path = "\"".$java_class_path."\"";
957
958 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
959
960 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
961 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
962 $cmd .= "-m $mapping_file_path";
963 }
964
965 open(*XMLWRITER, $cmd)
966 or die "can't open pipe to xslt: $!";
967
968
969 $self->{'xslt_writer'} = *XMLWRITER;
970
971 print XMLWRITER "<?DocStart?>\n";
972 print XMLWRITER "$output_file_name\n";
973
974
975 }
976
977
978sub close_xslt_pipe
979{
980 my $self = shift @_;
981
982
983 return unless defined $self->{'xslt_writer'} ;
984
985 my $xsltwriter = $self->{'xslt_writer'};
986
987 print $xsltwriter "<?DocEnd?>\n";
988 close($xsltwriter);
989
990 undef $self->{'xslt_writer'};
991
992}
993
994sub close_file_output
995{
996 my ($self) = @_;
997
998 # make sure that the handle has been opened - it won't be if we failed
999 # to import any documents...
1000 if (defined(fileno(GROUPPROCESS))) {
1001 $self->output_xml_footer('GROUPPROCESS','Archive');
1002 close GROUPPROCESS;
1003 }
1004
1005 my $OID = $self->{'gs_OID'};
1006 my $short_doc_file = $self->{'short_doc_file'};
1007
1008 if ($self->{'gzip'}) {
1009 my $doc_file = $self->{'gs_filename'};
1010 `gzip $doc_file`;
1011 $doc_file .= ".gz";
1012 $short_doc_file .= ".gz";
1013 if (!&FileUtils::fileExists($doc_file)) {
1014 my $outhandle = $self->{'output_handle'};
1015 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
1016 return 0;
1017 }
1018 }
1019
1020 # store reference in output_info
1021 my $output_info = $self->{'output_info'};
1022 return 0 if (!defined $output_info);
1023 $output_info->add_info($OID, $short_doc_file, undef, undef);
1024 return 1;
1025}
1026
1027
1028#the subclass should implement this method if is_group method could return 1.
1029sub close_group_output{
1030 my $self = shift (@_);
1031}
1032
1033sub is_group {
1034 my $self = shift (@_);
1035 return 0;
1036}
1037
1038my $dc_set = { Title => 1,
1039 Creator => 1,
1040 Subject => 1,
1041 Description => 1,
1042 Publisher => 1,
1043 Contributor => 1,
1044 Date => 1,
1045 Type => 1,
1046 Format => 1,
1047 Identifier => 1,
1048 Source => 1,
1049 Language => 1,
1050 Relation => 1,
1051 Coverage => 1,
1052 Rights => 1};
1053
1054
1055# returns an XML representation of the dublin core metadata
1056# if dc meta is not found, try ex meta
1057# This method is not used by the DSpacePlugout, which has its
1058# own method to save its dc metadata
1059sub get_dc_metadata {
1060 my $self = shift(@_);
1061 my ($doc_obj, $section, $version) = @_;
1062
1063 # build up string of dublin core metadata
1064 $section="" unless defined $section;
1065
1066 my $section_ptr = $doc_obj->_lookup_section($section);
1067 return "" unless defined $section_ptr;
1068
1069
1070 my $explicit_dc = {};
1071 my $explicit_ex_dc = {};
1072 my $explicit_ex = {};
1073
1074 my $all_text="";
1075
1076 # We want high quality dc metadata to go in first, so we store all the
1077 # assigned dc.* values first. Then, for all those dc metadata names in
1078 # the official dc set that are as yet unassigned, we look to see whether
1079 # embedded ex.dc.* metadata has defined some values for them. If not,
1080 # then for the same missing dc metadata names, we look in ex metadata.
1081
1082 foreach my $data (@{$section_ptr->{'metadata'}}){
1083 my $escaped_value = &docprint::escape_text($data->[1]);
1084 if ($data->[0]=~ m/^dc\./) {
1085 $data->[0] =~ tr/[A-Z]/[a-z]/;
1086
1087 $data->[0] =~ m/^dc\.(.*)/;
1088 my $dc_element = $1;
1089
1090 if (!defined $explicit_dc->{$dc_element}) {
1091 $explicit_dc->{$dc_element} = [];
1092 }
1093 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1094
1095 if (defined $version && ($version eq "oai_dc")) {
1096 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1097 }
1098 else {
1099 # qualifier???
1100 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1101 }
1102
1103 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1104 $data->[0] =~ m/^ex\.dc\.(.*)/;
1105 my $ex_dc_element = $1;
1106 my $lc_ex_dc_element = lc($ex_dc_element);
1107
1108 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1109 if (defined $dc_set->{$ex_dc_element}) {
1110 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1111 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1112 }
1113 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1114 }
1115 }
1116 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1117 $data->[0] =~ m/^(ex\.)?(.*)/;
1118 my $ex_element = $2;
1119 my $lc_ex_element = lc($ex_element);
1120
1121 if (defined $dc_set->{$ex_element}) {
1122 if (!defined $explicit_ex->{$lc_ex_element}) {
1123 $explicit_ex->{$lc_ex_element} = [];
1124 }
1125 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1126 }
1127 }
1128 }
1129
1130 # go through dc_set and for any element *not* defined in explicit_dc
1131 # that does exist in explicit_ex, add it in as metadata
1132 foreach my $k ( keys %$dc_set ) {
1133 my $lc_k = lc($k);
1134
1135 if (!defined $explicit_dc->{$lc_k}) {
1136 # try to find if ex.dc.* defines this dc.* meta,
1137 # if not, then look for whether there's an ex.* equivalent
1138
1139 if (defined $explicit_ex_dc->{$lc_k}) {
1140 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1141 my $dc_element = $lc_k;
1142 my $escaped_value = $v;
1143
1144 if (defined $version && ($version eq "oai_dc")) {
1145 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1146 }
1147 else {
1148 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1149 }
1150 }
1151 } elsif (defined $explicit_ex->{$lc_k}) {
1152 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1153 my $dc_element = $lc_k;
1154 my $escaped_value = $v;
1155
1156 if (defined $version && ($version eq "oai_dc")) {
1157 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1158 }
1159 else {
1160 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1161 }
1162 }
1163 }
1164 }
1165 }
1166
1167 if ($all_text eq "") {
1168 $all_text .= " There is no Dublin Core metatdata in this document\n";
1169 }
1170 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1171
1172 return $all_text;
1173}
1174
1175# Build up dublin_core metadata. Priority given to dc.* over ex.*
1176# This method was apparently added by Jeffrey and committed by Shaoqun.
1177# But we don't know why it was added, so not using it anymore.
1178sub new_get_dc_metadata {
1179
1180 my $self = shift(@_);
1181 my ($doc_obj, $section, $version) = @_;
1182
1183 # build up string of dublin core metadata
1184 $section="" unless defined $section;
1185
1186 my $section_ptr=$doc_obj->_lookup_section($section);
1187 return "" unless defined $section_ptr;
1188
1189 my $all_text = "";
1190 foreach my $data (@{$section_ptr->{'metadata'}}){
1191 my $escaped_value = &docprint::escape_text($data->[1]);
1192 my $dc_element = $data->[0];
1193
1194 my @array = split('\.',$dc_element);
1195 my ($type,$name);
1196
1197 if(defined $array[1])
1198 {
1199 $type = $array[0];
1200 $name = $array[1];
1201 }
1202 else
1203 {
1204 $type = "ex";
1205 $name = $array[0];
1206 }
1207
1208 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1209 }
1210 return $all_text;
1211}
1212
1213
12141;
Note: See TracBrowser for help on using the repository browser.