source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27880

Last change on this file since 27880 was 27880, checked in by ak19, 11 years ago

Intermediate commit. Bugfix to what broke ISIS/Marc tutorials. The archives doc dirs for each of the many records were not being created. This is an intermediate commit that fixes the bug in the usual situation. Dr Bainbridge has a better solution in mind that, once John has created the parallel building FileUtils subroutines for, will work in the parallel case too.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.8 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "false",
96 'hiddengli' => "no"},
97 { 'name' => 'no_rss',
98 'desc' => '{BasPlugout.no_rss}',
99 'type' => 'flag',
100 'reqd' => 'no',
101 'hiddengli' => 'yes'}
102];
103
104my $options = { 'name' => "BasePlugout",
105 'desc' => "{BasPlugout.desc}",
106 'abstract' => "yes",
107 'inherits' => "no",
108 'args' => $arguments};
109
110sub new
111{
112 my $class = shift (@_);
113
114 my ($plugoutlist,$args,$hashArgOptLists) = @_;
115 push(@$plugoutlist, $class);
116
117 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
118
119 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
120 push(@{$hashArgOptLists->{"OptList"}},$options);
121
122 my $self = {};
123 $self->{'plugout_type'} = $class;
124 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
125 $self->{"info_only"} = 0;
126
127 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
128 # the args, just return the object.
129 foreach my $strArg (@{$args})
130 {
131 if(defined $strArg && $strArg eq "-gsdlinfo")
132 {
133 $self->{"info_only"} = 1;
134 return bless $self, $class;
135 }
136 }
137
138 delete $self->{"info_only"};
139
140 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
141 {
142 my $classTempClass = bless $self, $class;
143 print STDERR "<BadPlugout d=$plugout_name>\n";
144 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
145 $classTempClass->print_txt_usage(""); # Use default resource bundle
146 die "\n";
147 }
148
149
150 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
151 {
152 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
153 if (!defined $full_file_path) {
154 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
155 die "\n";
156 }
157 $self->{'xslt_file'} = $full_file_path;
158 }
159
160 $self->{'gs_count'} = 0;
161
162 $self->{'keep_import_structure'} = 0;
163
164 return bless $self, $class;
165
166}
167
168sub print_xml_usage
169{
170 my $self = shift(@_);
171 my $header = shift(@_);
172 my $high_level_information_only = shift(@_);
173
174 # XML output is always in UTF-8
175 gsprintf::output_strings_in_UTF8;
176
177 if ($header) {
178 &PrintUsage::print_xml_header("plugout");
179 }
180 $self->print_xml($high_level_information_only);
181}
182
183
184sub print_xml
185{
186 my $self = shift(@_);
187 my $high_level_information_only = shift(@_);
188
189 my $optionlistref = $self->{'option_list'};
190 my @optionlist = @$optionlistref;
191 my $plugoutoptions = shift(@$optionlistref);
192 return if (!defined($plugoutoptions));
193
194 gsprintf(STDERR, "<PlugoutInfo>\n");
195 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
196 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
197 $desc =~ s/</&amp;lt;/g; # doubly escaped
198 $desc =~ s/>/&amp;gt;/g;
199 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
200 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
201 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
202 unless (defined($high_level_information_only)) {
203 gsprintf(STDERR, " <Arguments>\n");
204 if (defined($plugoutoptions->{'args'})) {
205 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
206 }
207 gsprintf(STDERR, " </Arguments>\n");
208
209 # Recurse up the plugout hierarchy
210 $self->print_xml();
211 }
212 gsprintf(STDERR, "</PlugoutInfo>\n");
213}
214
215
216sub print_txt_usage
217{
218 my $self = shift(@_);
219
220 # Print the usage message for a plugout (recursively)
221 my $descoffset = $self->determine_description_offset(0);
222 $self->print_plugout_usage($descoffset, 1);
223}
224
225sub determine_description_offset
226{
227 my $self = shift(@_);
228 my $maxoffset = shift(@_);
229
230 my $optionlistref = $self->{'option_list'};
231 my @optionlist = @$optionlistref;
232 my $plugoutoptions = pop(@$optionlistref);
233 return $maxoffset if (!defined($plugoutoptions));
234
235 # Find the length of the longest option string of this download
236 my $plugoutargs = $plugoutoptions->{'args'};
237 if (defined($plugoutargs)) {
238 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
239 if ($longest > $maxoffset) {
240 $maxoffset = $longest;
241 }
242 }
243
244 # Recurse up the download hierarchy
245 $maxoffset = $self->determine_description_offset($maxoffset);
246 $self->{'option_list'} = \@optionlist;
247 return $maxoffset;
248}
249
250
251sub print_plugout_usage
252{
253 my $self = shift(@_);
254 my $descoffset = shift(@_);
255 my $isleafclass = shift(@_);
256
257 my $optionlistref = $self->{'option_list'};
258 my @optionlist = @$optionlistref;
259 my $plugoutoptions = shift(@$optionlistref);
260 return if (!defined($plugoutoptions));
261
262 my $plugoutname = $plugoutoptions->{'name'};
263 my $plugoutargs = $plugoutoptions->{'args'};
264 my $plugoutdesc = $plugoutoptions->{'desc'};
265
266 # Produce the usage information using the data structure above
267 if ($isleafclass) {
268 if (defined($plugoutdesc)) {
269 gsprintf(STDERR, "$plugoutdesc\n\n");
270 }
271 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
272 }
273
274 # Display the download options, if there are some
275 if (defined($plugoutargs)) {
276 # Calculate the column offset of the option descriptions
277 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
278
279 if ($isleafclass) {
280 gsprintf(STDERR, " {common.specific_options}:\n");
281 }
282 else {
283 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
284 }
285
286 # Display the download options
287 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
288 }
289
290 # Recurse up the download hierarchy
291 $self->print_plugout_usage($descoffset, 0);
292 $self->{'option_list'} = \@optionlist;
293}
294
295
296sub error
297{
298 my ($strFunctionName,$strError) = @_;
299 {
300 print "Error occoured in BasePlugout.pm\n".
301 "In Function: ".$strFunctionName."\n".
302 "Error Message: ".$strError."\n";
303 exit(-1);
304 }
305}
306
307# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
308sub set_OIDtype {
309 my $self = shift (@_);
310 my ($type, $metadata) = @_;
311
312 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
313 $self->{'OIDtype'} = $type;
314 } else {
315 $self->{'OIDtype'} = "hash";
316 }
317 if ($type =~ /^assigned$/) {
318 if (defined $metadata) {
319 $self->{'OIDmetadata'} = $metadata;
320 } else {
321 $self->{'OIDmetadata'} = "dc.Identifier";
322 }
323 }
324}
325
326sub set_output_dir
327{
328 my $self = shift @_;
329 my ($output_dir) = @_;
330
331 $self->{'output_dir'} = $output_dir;
332}
333
334sub setoutputdir
335{
336 my $self = shift @_;
337 my ($output_dir) = @_;
338
339 $self->{'output_dir'} = $output_dir;
340}
341
342sub get_output_dir
343{
344 my $self = shift (@_);
345
346 return $self->{'output_dir'};
347}
348
349sub getoutputdir
350{
351 my $self = shift (@_);
352
353 return $self->{'output_dir'};
354}
355
356sub getoutputinfo
357{
358 my $self = shift (@_);
359
360 return $self->{'output_info'};
361}
362
363
364sub get_output_handler
365{
366 my $self = shift (@_);
367
368 my ($output_file_name) = @_;
369
370 my $fh;
371 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
372
373 return $fh;
374}
375
376sub release_output_handler
377{
378 my $self = shift (@_);
379 my ($outhandler) = @_;
380
381 close($outhandler);
382
383}
384
385sub output_xml_header {
386 my $self = shift (@_);
387 my ($handle,$docroot,$nondoctype) = @_;
388
389
390 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
391
392 #For Dspace must be UTF in lower case
393 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
394
395 if (!defined $nondoctype){
396 my $doctype = (defined $docroot) ? $docroot : "Section";
397
398 # Used to be '<!DOCTYPE Archive SYSTEM ...'
399
400 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
401 }
402
403 print $handle "<$docroot>\n" if defined $docroot;
404}
405
406sub output_xml_footer {
407 my $self = shift (@_);
408 my ($handle,$docroot) = @_;
409 print $handle "</$docroot>\n" if defined $docroot;
410}
411
412
413sub output_general_xml_header
414{
415 my $self = shift (@_);
416 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
417
418 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
419
420 if (defined $opt_dtd) {
421 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
422 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
423 }
424
425 if (defined $docroot) {
426 my $full_docroot = $docroot;
427 if (defined $opt_attributes) {
428 $full_docroot .= " $opt_attributes";
429 }
430
431 print $handle "<$full_docroot>\n"
432 }
433}
434
435sub output_general_xml_footer
436{
437 output_xml_footer(@_);
438}
439
440
441sub process {
442 my $self = shift (@_);
443 my ($doc_obj) = @_;
444
445 # for OAI purposes
446 $doc_obj->set_lastmodified();
447 $doc_obj->set_oailastmodified();
448
449 if ($self->{'group_size'} > 1) {
450 $self->group_process ($doc_obj);
451 return;
452 }
453
454 my $OID = $doc_obj->get_OID();
455 $OID = "NULL" unless defined $OID;
456
457 my $top_section = $doc_obj->get_top_section();
458
459 #get document's directory
460 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
461
462 my $output_info = $self->{'output_info'};
463 return if (!defined $output_info);
464
465 ##############################
466 # call subclass' saveas method
467 ##############################
468 $self->saveas($doc_obj,$doc_dir);
469 $self->archiveinf_db($doc_obj,$doc_dir);
470
471}
472
473sub store_output_info_reference {
474 my $self = shift (@_);
475 my ($doc_obj) = @_;
476
477 my $output_info = $self->{'output_info'};
478 my $metaname = $self->{'sortmeta'};
479
480 if (!defined $metaname || $metaname !~ /\S/) {
481 my $OID = $doc_obj->get_OID();
482 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "");
483 return;
484 }
485
486 if ($metaname eq "OID") { # sort by OID
487 my $OID = $doc_obj->get_OID();
488 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID);
489 return;
490 }
491
492 my $metadata = "";
493 my $top_section = $doc_obj->get_top_section();
494
495 my @commameta_list = split(/,/, $metaname);
496 foreach my $cmn (@commameta_list) {
497 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
498 if ($meta) {
499 # do remove prefix/suffix - this will apply to all values
500 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
501 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
502 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
503 $metadata .= $meta if ($meta);
504 }
505 }
506
507 # store reference in the output_info
508 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
509
510}
511
512sub group_process {
513
514 my $self = shift (@_);
515 my ($doc_obj) = @_;
516
517 my $OID = $doc_obj->get_OID();
518 $OID = "NULL" unless defined $OID;
519
520 my $groupsize = $self->{'group_size'};
521 my $gs_count = $self->{'gs_count'};
522 my $open_new_file = (($gs_count % $groupsize)==0);
523 my $outhandle = $self->{'output_handle'};
524
525 # opening a new file, or document has assoicated files => directory needed
526 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
527
528 # The directory the archive file (doc.xml) and all associated files
529 # should end up in
530 my $doc_dir;
531 # If we've determined its time for a new file, open it now
532 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
533 {
534 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
535 # only if opening new file
536 my $output_dir = $self->get_output_dir();
537 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
538 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
539 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
540
541 if ($gs_count>0)
542 {
543 return if (!$self->close_file_output());
544 }
545
546 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
547
548
549 $self->{'gs_filename'} = $doc_file;
550 $self->{'short_doc_file'} = $short_doc_file;
551 $self->{'gs_OID'} = $OID;
552 $self->{'gs_doc_dir'} = $doc_dir;
553
554 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
555 }
556 # Otherwise load the same archive document directory used last time
557 else
558 {
559 $doc_dir = $self->{'gs_doc_dir'};
560 }
561
562 # copy all the associated files, add this information as metadata
563 # to the document
564 print $outhandle "Writing associated files to $doc_dir\n";
565 $self->process_assoc_files ($doc_obj, $doc_dir);
566
567 # look up 'gsdlmetafile' metadata and store that information
568 # explicitly in $doc_obj
569 $self->process_metafiles_metadata ($doc_obj);
570 }
571
572 # save this document
573 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
574 print GROUPPROCESS $section_text;
575
576 $self->{'gs_count'}++;
577}
578
579
580sub saveas {
581 my $self = shift (@_);
582
583 die "Basplug::saveas function must be implemented in sub classes\n";
584}
585
586sub get_doc_dir {
587 my $self = shift (@_);
588 my ($OID, $source_filename) = @_;
589
590 my $working_dir = $self->get_output_dir();
591 my $working_info = $self->{'output_info'};
592 return if (!defined $working_info);
593
594 my $doc_info = $working_info->get_info($OID);
595 my $doc_dir = '';
596
597 if (defined $doc_info && scalar(@$doc_info) >= 1)
598 {
599 # This OID already has an archives directory, so use it again
600 $doc_dir = $doc_info->[0];
601 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
602 }
603 elsif ($self->{'keep_import_structure'})
604 {
605 $source_filename = &File::Basename::dirname($source_filename);
606 $source_filename =~ s/[\\\/]+/\//g;
607 $source_filename =~ s/\/$//;
608
609 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
610 }
611
612 # We have to use a new archives directory for this document
613 if ($doc_dir eq "")
614 {
615 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
616 }
617
618 if (!defined $self->{'group'} || !$self->{'group'}){
619 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
620 }
621
622 return $doc_dir;
623}
624
625# Before parallel building, this was the method that created the new doc dirs in archives
626sub get_new_doc_dir_OLD {
627 my $self = shift (@_);
628 my($working_info,$working_dir,$OID) = @_;
629
630
631 my $doc_dir = "";
632 my $doc_dir_rest = $OID;
633
634 # remove any \ and / from the OID
635 $doc_dir_rest =~ s/[\\\/]//g;
636
637 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
638 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
639
640 my $doc_dir_num = 0;
641
642 do {
643 $doc_dir .= "/" if $doc_dir_num > 0;
644 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
645 if ($self->{'subdir_hash_prefix'})
646 {
647 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
648 }
649 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
650 if ($doc_dir_rest =~ s/$pattern//i)
651 {
652 $doc_dir .= $1;
653 $doc_dir_num++;
654 }
655 } while ($doc_dir_rest ne "" &&
656 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
657 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
658 my $i = 1;
659 my $doc_dir_base = $doc_dir;
660 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
661 $doc_dir = "$doc_dir_base-$i";
662 $i++;
663 }
664
665 return "$doc_dir.dir";
666}
667
668## @function get_new_doc_dir()
669#
670# Once a doc object is ready to write to disk (and hence has a nice OID),
671# generate a unique subdirectory to write the information to.
672# - create the directory as part of this call, to try and avoid race conditions
673# found in parallel processing [jmt12]
674#
675sub get_new_doc_dir
676{
677 my $self = shift (@_);
678 my($working_info,$working_dir,$OID) = @_;
679
680 my $doc_dir = "";
681 my $doc_dir_rest = $OID;
682
683 # remove any \ and / from the OID
684 $doc_dir_rest =~ s/[\\\/]//g;
685
686 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
687 if ($ENV{'GSDLOS'} =~ /^windows$/i)
688 {
689 $doc_dir_rest =~ s/\://g;
690 }
691
692 # we generally create a unique directory by adding consequtive fragments of
693 # the document identifier (split by some predefined length - defaulting to
694 # 8) until we find a directory that doesn't yet exist. Note that directories
695 # that contain a document have a suffix ".dir" (whereas those that contain
696 # only subdirectories have no suffix).
697 my $doc_dir_num = 0; # how many directories deep we are
698 my $created_directory = 0; # have we successfully created a new directory
699 do
700 {
701 # (does this work on windows? - jmt12)
702 if ($doc_dir_num > 0)
703 {
704 $doc_dir .= '/';
705 }
706 # the default matching pattern grabs the next 'subdir_split_length'
707 # characters of the OID to act as the next subdirectory
708 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
709 # Do we count any "HASH" prefix against the split length limit?
710 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
711 {
712 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
713 }
714 # Note the use of 's' to both capture the next chuck of OID and to remove
715 # it from OID at the same time
716 if ($doc_dir_rest =~ s/$pattern//i)
717 {
718 $doc_dir .= $1;
719 $doc_dir_num++;
720 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
721 if(!FileUtils::directoryExists($full_doc_dir)) {
722 &FileUtils::makeAllDirectories($full_doc_dir);
723 $created_directory = 1;
724 }
725 }
726 ###my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
727 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
728 ###rint STDERR " - create directory: $new_dir => $created_directory\n";
729 ###rint STDERR " - rest: $doc_dir_rest\n";
730 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
731 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
732 }
733 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
734
735 # not unique yet? Add on an incremental suffix until we are unique
736 my $i = 1;
737 my $doc_dir_base = $doc_dir;
738 while ($created_directory == 0)
739 {
740 $doc_dir = $doc_dir_base . '-' . $i;
741 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
742 $i++;
743 }
744
745 # in theory this should never happen
746 if (!$created_directory)
747 {
748 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
749 }
750
751 return $doc_dir . '.dir';
752}
753## get_new_doc_dir()
754
755
756sub process_assoc_files {
757 my $self = shift (@_);
758 my ($doc_obj, $doc_dir, $handle) = @_;
759
760 my $outhandle = $self->{'output_handle'};
761
762 my $output_dir = $self->get_output_dir();
763 return if (!defined $output_dir);
764
765 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
766
767 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
768 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
769
770 my @assoc_files = ();
771 my $filename;;
772
773 my $source_filename = $doc_obj->get_source_filename();
774
775 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
776
777 if (defined $collect_dir) {
778 my $dirsep_regexp = &util::get_os_dirsep();
779
780 if ($collect_dir !~ /$dirsep_regexp$/) {
781 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
782 }
783
784 # This test is never going to fail on Windows -- is this a problem?
785
786 if ($source_filename !~ /^$dirsep_regexp/) {
787 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
788 }
789 }
790
791
792 # set the assocfile path (even if we have no assoc files - need this for lucene)
793 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
794 "assocfilepath",
795 "$doc_dir");
796 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
797 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
798 $dir = "" unless defined $dir;
799
800 my $utf8_real_filename = $assoc_file_rec->[0];
801
802 # for some reasons the image associate file has / before the full path
803 $utf8_real_filename =~ s/^\\(.*)/$1/i;
804
805## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
806 my $real_filename = $utf8_real_filename;
807 $real_filename = &util::downgrade_if_dos_filename($real_filename);
808
809 if (&FileUtils::fileExists($real_filename)) {
810
811 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
812
813 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
814
815 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
816 "gsdlassocfile",
817 "$afile:$assoc_file_rec->[2]:$dir");
818 } elsif ($self->{'verbosity'} > 1) {
819 print $outhandle "BasePlugout::process couldn't copy the associated file " .
820 "$real_filename to $afile\n";
821 }
822 }
823}
824
825
826sub process_metafiles_metadata
827{
828 my $self = shift (@_);
829 my ($doc_obj) = @_;
830
831 my $top_section = $doc_obj->get_top_section();
832 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
833
834 foreach my $metafile_pair (@$metafiles) {
835 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
836
837 $doc_obj->metadata_file($full_metafile,$metafile);
838 }
839
840 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
841}
842
843sub archiveinf_files_to_field
844{
845 my $self = shift(@_);
846 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
847
848 foreach my $file_rec (@$files) {
849 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
850 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
851 # for some reasons the image associate file has / before the full path
852 $real_filename =~ s/^\\(.*)/$1/i;
853
854 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
855
856 if (&FileUtils::fileExists($raw_filename)) {
857
858# if (defined $collect_dir) {
859# my $collect_dir_re_safe = $collect_dir;
860# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
861# $collect_dir_re_safe =~ s/\./\\./g;##
862
863# $real_filename =~ s/^$collect_dir_re_safe//;
864# }
865
866 if (defined $reverse_lookups) {
867 $reverse_lookups->{$real_filename} = 1;
868 }
869### push(@{$oid_files->{$field}},$full_file);
870 push(@{$oid_files->{$field}},$raw_filename);
871 }
872 else {
873 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
874 }
875 }
876}
877
878sub archiveinf_db
879{
880 my $self = shift (@_);
881 my ($doc_obj) = @_;
882
883 my $verbosity = $self->{'verbosity'};
884
885 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
886 if (defined $collect_dir) {
887 my $dirsep_regexp = &util::get_os_dirsep();
888
889 if ($collect_dir !~ /$dirsep_regexp$/) {
890 # ensure there is a slash at the end
891 $collect_dir .= &util::get_dirsep();
892 }
893 }
894
895 my $oid = $doc_obj->get_OID();
896 my $source_filename = $doc_obj->get_unmodified_source_filename();
897 my $working_info = $self->{'output_info'};
898 my $doc_info = $working_info->get_info($oid);
899
900 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
901 # doc_file is the path to the archive doc.xml. Make sure it has unix
902 # slashes, then if the collection is copied to linux, it can be built without reimport
903 $doc_file =~ s/\\/\//g;
904 my $oid_files = { 'doc-file' => $doc_file,
905 'index-status' => $index_status,
906 'src-file' => $source_filename,
907 'sort-meta' => $sortmeta,
908 'assoc-file' => [],
909 'meta-file' => [] };
910
911 my $reverse_lookups = { $source_filename => "1" };
912
913
914 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
915 $collect_dir,$oid_files,$reverse_lookups);
916
917
918 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
919 $collect_dir,$oid_files);
920
921 # Get the infodbtype value for this collection from the arcinfo object
922 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
923 my $output_dir = $self->{'output_dir'};
924
925 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
926 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
927
928 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
929
930 if (!$self->{'no_rss'})
931 {
932 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
933 my $top_section = $doc_obj->get_top_section();
934
935 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
936 my $dc_title = join("; ", @$dc_titles);
937
938 if ($oid_files->{'index-status'} eq "R") {
939 $dc_title .= " (Updated)";
940 }
941
942 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
943 my $rss_fh;
944 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh)) {
945 print $rss_fh "<item>\n";
946 print $rss_fh " <title>$dc_title</title>\n";
947 print $rss_fh " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
948 print $rss_fh "</item>\n";
949 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
950 }
951 else {
952 print STDERR "**** Failed to open $rss_filename\n$!\n";
953 }
954 }
955 }
956
957 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
958 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
959 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
960 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
961
962 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
963 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
964 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
965
966 foreach my $rl (keys %$reverse_lookups) {
967 $working_info->add_reverseinfo($rl,$oid);
968 }
969
970 # meta files not set in reverese entry, but need to set the metadata flag
971 if (defined $doc_obj->get_meta_files()) {
972 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
973 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
974 $working_info->set_meta_file_flag($full_file);
975 }
976 }
977}
978
979
980sub set_sortmeta {
981 my $self = shift (@_);
982 my ($sortmeta, $removeprefix, $removesuffix) = @_;
983
984 $self->{'sortmeta'} = $sortmeta;
985 if (defined ($removeprefix) && $removeprefix ) {
986 $removeprefix =~ s/^\^//; # don't need a leading ^
987 $self->{'removeprefix'} = $removeprefix;
988 }
989 if (defined ($removesuffix) && $removesuffix) {
990 $removesuffix =~ s/\$$//; # don't need a trailing $
991 $self->{'removesuffix'} = $removesuffix;
992 }
993}
994
995sub open_xslt_pipe
996{
997 my $self = shift @_;
998 my ($output_file_name, $xslt_file)=@_;
999
1000 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1001
1002 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1003
1004 my $mapping_file_path = "";
1005
1006 if ($ENV{'GSDLOS'} eq "windows"){
1007 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1008 # this file:/// bit didn't work for me on windows XP
1009 #$xslt_file = "\"file:///".$xslt_file."\"";
1010 #$mapping_file_path = "\"file:///";
1011 }
1012 else{
1013 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1014 }
1015
1016
1017 $java_class_path = "\"".$java_class_path."\"";
1018
1019 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1020
1021 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1022 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1023 $cmd .= "-m $mapping_file_path";
1024 }
1025
1026 open(*XMLWRITER, $cmd)
1027 or die "can't open pipe to xslt: $!";
1028
1029
1030 $self->{'xslt_writer'} = *XMLWRITER;
1031
1032 print XMLWRITER "<?DocStart?>\n";
1033 print XMLWRITER "$output_file_name\n";
1034
1035
1036 }
1037
1038
1039sub close_xslt_pipe
1040{
1041 my $self = shift @_;
1042
1043
1044 return unless defined $self->{'xslt_writer'} ;
1045
1046 my $xsltwriter = $self->{'xslt_writer'};
1047
1048 print $xsltwriter "<?DocEnd?>\n";
1049 close($xsltwriter);
1050
1051 undef $self->{'xslt_writer'};
1052
1053}
1054
1055sub close_file_output
1056{
1057 my ($self) = @_;
1058
1059 # make sure that the handle has been opened - it won't be if we failed
1060 # to import any documents...
1061 if (defined(fileno(GROUPPROCESS))) {
1062 $self->output_xml_footer('GROUPPROCESS','Archive');
1063 close GROUPPROCESS;
1064 }
1065
1066 my $OID = $self->{'gs_OID'};
1067 my $short_doc_file = $self->{'short_doc_file'};
1068
1069 if ($self->{'gzip'}) {
1070 my $doc_file = $self->{'gs_filename'};
1071 `gzip $doc_file`;
1072 $doc_file .= ".gz";
1073 $short_doc_file .= ".gz";
1074 if (!&FileUtils::fileExists($doc_file)) {
1075 my $outhandle = $self->{'output_handle'};
1076 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
1077 return 0;
1078 }
1079 }
1080
1081 # store reference in output_info
1082 my $output_info = $self->{'output_info'};
1083 return 0 if (!defined $output_info);
1084 $output_info->add_info($OID, $short_doc_file, undef, undef);
1085 return 1;
1086}
1087
1088
1089#the subclass should implement this method if is_group method could return 1.
1090sub close_group_output{
1091 my $self = shift (@_);
1092}
1093
1094sub is_group {
1095 my $self = shift (@_);
1096 return 0;
1097}
1098
1099my $dc_set = { Title => 1,
1100 Creator => 1,
1101 Subject => 1,
1102 Description => 1,
1103 Publisher => 1,
1104 Contributor => 1,
1105 Date => 1,
1106 Type => 1,
1107 Format => 1,
1108 Identifier => 1,
1109 Source => 1,
1110 Language => 1,
1111 Relation => 1,
1112 Coverage => 1,
1113 Rights => 1};
1114
1115
1116# returns an XML representation of the dublin core metadata
1117# if dc meta is not found, try ex meta
1118# This method is not used by the DSpacePlugout, which has its
1119# own method to save its dc metadata
1120sub get_dc_metadata {
1121 my $self = shift(@_);
1122 my ($doc_obj, $section, $version) = @_;
1123
1124 # build up string of dublin core metadata
1125 $section="" unless defined $section;
1126
1127 my $section_ptr = $doc_obj->_lookup_section($section);
1128 return "" unless defined $section_ptr;
1129
1130
1131 my $explicit_dc = {};
1132 my $explicit_ex_dc = {};
1133 my $explicit_ex = {};
1134
1135 my $all_text="";
1136
1137 # We want high quality dc metadata to go in first, so we store all the
1138 # assigned dc.* values first. Then, for all those dc metadata names in
1139 # the official dc set that are as yet unassigned, we look to see whether
1140 # embedded ex.dc.* metadata has defined some values for them. If not,
1141 # then for the same missing dc metadata names, we look in ex metadata.
1142
1143 foreach my $data (@{$section_ptr->{'metadata'}}){
1144 my $escaped_value = &docprint::escape_text($data->[1]);
1145 if ($data->[0]=~ m/^dc\./) {
1146 $data->[0] =~ tr/[A-Z]/[a-z]/;
1147
1148 $data->[0] =~ m/^dc\.(.*)/;
1149 my $dc_element = $1;
1150
1151 if (!defined $explicit_dc->{$dc_element}) {
1152 $explicit_dc->{$dc_element} = [];
1153 }
1154 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1155
1156 if (defined $version && ($version eq "oai_dc")) {
1157 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1158 }
1159 else {
1160 # qualifier???
1161 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1162 }
1163
1164 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1165 $data->[0] =~ m/^ex\.dc\.(.*)/;
1166 my $ex_dc_element = $1;
1167 my $lc_ex_dc_element = lc($ex_dc_element);
1168
1169 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1170 if (defined $dc_set->{$ex_dc_element}) {
1171 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1172 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1173 }
1174 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1175 }
1176 }
1177 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1178 $data->[0] =~ m/^(ex\.)?(.*)/;
1179 my $ex_element = $2;
1180 my $lc_ex_element = lc($ex_element);
1181
1182 if (defined $dc_set->{$ex_element}) {
1183 if (!defined $explicit_ex->{$lc_ex_element}) {
1184 $explicit_ex->{$lc_ex_element} = [];
1185 }
1186 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1187 }
1188 }
1189 }
1190
1191 # go through dc_set and for any element *not* defined in explicit_dc
1192 # that does exist in explicit_ex, add it in as metadata
1193 foreach my $k ( keys %$dc_set ) {
1194 my $lc_k = lc($k);
1195
1196 if (!defined $explicit_dc->{$lc_k}) {
1197 # try to find if ex.dc.* defines this dc.* meta,
1198 # if not, then look for whether there's an ex.* equivalent
1199
1200 if (defined $explicit_ex_dc->{$lc_k}) {
1201 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1202 my $dc_element = $lc_k;
1203 my $escaped_value = $v;
1204
1205 if (defined $version && ($version eq "oai_dc")) {
1206 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1207 }
1208 else {
1209 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1210 }
1211 }
1212 } elsif (defined $explicit_ex->{$lc_k}) {
1213 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1214 my $dc_element = $lc_k;
1215 my $escaped_value = $v;
1216
1217 if (defined $version && ($version eq "oai_dc")) {
1218 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1219 }
1220 else {
1221 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1222 }
1223 }
1224 }
1225 }
1226 }
1227
1228 if ($all_text eq "") {
1229 $all_text .= " There is no Dublin Core metatdata in this document\n";
1230 }
1231 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1232
1233 return $all_text;
1234}
1235
1236# Build up dublin_core metadata. Priority given to dc.* over ex.*
1237# This method was apparently added by Jeffrey and committed by Shaoqun.
1238# But we don't know why it was added, so not using it anymore.
1239sub new_get_dc_metadata {
1240
1241 my $self = shift(@_);
1242 my ($doc_obj, $section, $version) = @_;
1243
1244 # build up string of dublin core metadata
1245 $section="" unless defined $section;
1246
1247 my $section_ptr=$doc_obj->_lookup_section($section);
1248 return "" unless defined $section_ptr;
1249
1250 my $all_text = "";
1251 foreach my $data (@{$section_ptr->{'metadata'}}){
1252 my $escaped_value = &docprint::escape_text($data->[1]);
1253 my $dc_element = $data->[0];
1254
1255 my @array = split('\.',$dc_element);
1256 my ($type,$name);
1257
1258 if(defined $array[1])
1259 {
1260 $type = $array[0];
1261 $name = $array[1];
1262 }
1263 else
1264 {
1265 $type = "ex";
1266 $name = $array[0];
1267 }
1268
1269 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1270 }
1271 return $all_text;
1272}
1273
1274
12751;
Note: See TracBrowser for help on using the repository browser.