source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 32159

Last change on this file since 32159 was 32159, checked in by ak19, 6 years ago

incremental building was not being incremental when no metadata was assigned to any of the files (as happens with our docs for quick test collections). A default metadata.xml is present, but 'empty' in that it contains no FileSet elements with metadata elements assigned to FileName elements. But we still want incremental behaviour. The idea was to write out an entry into archiveinf-src.db for each metadata.xml processed, not just for each meta.xml file actually referencing a doc, as BasePlugout was doing so far on a per doc basis. Kathy come up with the actual infrastructure that can make it work (to ensure all the necessary objects are available), Dr Bainbridge approved this, and it's now been added into the code.

  • Property svn:keywords set to Author Date Id Revision
File size: 37.7 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40use sorttools;
41
42# suppress the annoying "subroutine redefined" warning that various
43# gets cause under perl 5.6
44$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
45
46my $arguments = [
47 { 'name' => "xslt_file",
48 'desc' => "{BasPlugout.xslt_file}",
49 'type' => "string",
50 'reqd' => "no",
51 'deft' => "",
52 'hiddengli' => "no"},
53 { 'name' => "subdir_split_length",
54 'desc' => "{BasPlugout.subdir_split_length}",
55 'type' => "int",
56 'reqd' => "no",
57 'deft' => "8",
58 'hiddengli' => "no"},
59 { 'name' => "subdir_hash_prefix",
60 'desc' => "{BasPlugout.subdir_hash_prefix}",
61 'type' => "flag",
62 'reqd' => "no",
63 'deft' => "0",
64 'hiddengli' => "no"},
65 { 'name' => "gzip_output",
66 'desc' => "{BasPlugout.gzip_output}",
67 'type' => "flag",
68 'reqd' => "no",
69 'hiddengli' => "no"},
70 { 'name' => "verbosity",
71 'desc' => "{BasPlugout.verbosity}",
72 'type' => "int",
73 'deft' => "0",
74 'reqd' => "no",
75 'hiddengli' => "no"},
76 { 'name' => "output_info",
77 'desc' => "{BasPlugout.output_info}",
78 'type' => "string",
79 'reqd' => "yes",
80 'hiddengli' => "yes"},
81 { 'name' => "output_handle",
82 'desc' => "{BasPlugout.output_handle}",
83 'type' => "string",
84 'deft' => 'STDERR',
85 'reqd' => "no",
86 'hiddengli' => "yes"},
87 { 'name' => "debug",
88 'desc' => "{BasPlugout.debug}",
89 'type' => "flag",
90 'reqd' => "no",
91 'hiddengli' => "yes"},
92 { 'name' => 'no_rss',
93 'desc' => "{BasPlugout.no_rss}",
94 'type' => 'flag',
95 'reqd' => 'no',
96 'hiddengli' => 'yes'},
97 { 'name' => 'rss_title',
98 'desc' => "{BasPlugout.rss_title}",
99 'type' => 'string',
100 'deft' => 'dc.Title',
101 'reqd' => 'no',
102 'hiddengli' => 'yes'},
103 { 'name' => "no_auxiliary_databases",
104 'desc' => "{BasPlugout.no_auxiliary_databases}",
105 'type' => "flag",
106 'reqd' => "no",
107 'hiddengli' => "yes"}
108
109];
110
111my $options = { 'name' => "BasePlugout",
112 'desc' => "{BasPlugout.desc}",
113 'abstract' => "yes",
114 'inherits' => "no",
115 'args' => $arguments};
116
117sub new
118{
119 my $class = shift (@_);
120
121 my ($plugoutlist,$args,$hashArgOptLists) = @_;
122 push(@$plugoutlist, $class);
123
124 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
125
126 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
127 push(@{$hashArgOptLists->{"OptList"}},$options);
128
129 my $self = {};
130 $self->{'plugout_type'} = $class;
131 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
132 $self->{"info_only"} = 0;
133
134 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
135 # the args, just return the object.
136 foreach my $strArg (@{$args})
137 {
138 if(defined $strArg && $strArg eq "-gsdlinfo")
139 {
140 $self->{"info_only"} = 1;
141 return bless $self, $class;
142 }
143 }
144
145 delete $self->{"info_only"};
146
147 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
148 {
149 my $classTempClass = bless $self, $class;
150 print STDERR "<BadPlugout d=$plugout_name>\n";
151 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
152 $classTempClass->print_txt_usage(""); # Use default resource bundle
153 die "\n";
154 }
155
156
157 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
158 {
159 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
160 if (!defined $full_file_path) {
161 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
162 die "\n";
163 }
164 $self->{'xslt_file'} = $full_file_path;
165 }
166
167 # for group processing
168 $self->{'gs_count'} = 0;
169 $self->{'group_position'} = 1;
170
171 $self->{'keep_import_structure'} = 0;
172
173 $self->{'generate_databases'} = 1;
174 if ($self->{'no_auxiliary_databases'}) {
175 $self->{'generate_databases'} = 0;
176 }
177 undef $self->{'no_auxiliary_databases'};
178 return bless $self, $class;
179
180}
181
182# implement this in subclass if you want to do some initialization after
183# loading and setting parameters, and before processing the documents.
184sub begin {
185
186 my $self= shift (@_);
187
188}
189sub print_xml_usage
190{
191 my $self = shift(@_);
192 my $header = shift(@_);
193 my $high_level_information_only = shift(@_);
194
195 # XML output is always in UTF-8
196 gsprintf::output_strings_in_UTF8;
197
198 if ($header) {
199 &PrintUsage::print_xml_header("plugout");
200 }
201 $self->print_xml($high_level_information_only);
202}
203
204
205sub print_xml
206{
207 my $self = shift(@_);
208 my $high_level_information_only = shift(@_);
209
210 my $optionlistref = $self->{'option_list'};
211 my @optionlist = @$optionlistref;
212 my $plugoutoptions = shift(@$optionlistref);
213 return if (!defined($plugoutoptions));
214
215 gsprintf(STDERR, "<PlugoutInfo>\n");
216 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
217 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
218 $desc =~ s/</&amp;lt;/g; # doubly escaped
219 $desc =~ s/>/&amp;gt;/g;
220 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
222 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
223 unless (defined($high_level_information_only)) {
224 gsprintf(STDERR, " <Arguments>\n");
225 if (defined($plugoutoptions->{'args'})) {
226 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
227 }
228 gsprintf(STDERR, " </Arguments>\n");
229
230 # Recurse up the plugout hierarchy
231 $self->print_xml();
232 }
233 gsprintf(STDERR, "</PlugoutInfo>\n");
234}
235
236
237sub print_txt_usage
238{
239 my $self = shift(@_);
240
241 # Print the usage message for a plugout (recursively)
242 my $descoffset = $self->determine_description_offset(0);
243 $self->print_plugout_usage($descoffset, 1);
244}
245
246sub determine_description_offset
247{
248 my $self = shift(@_);
249 my $maxoffset = shift(@_);
250
251 my $optionlistref = $self->{'option_list'};
252 my @optionlist = @$optionlistref;
253 my $plugoutoptions = pop(@$optionlistref);
254 return $maxoffset if (!defined($plugoutoptions));
255
256 # Find the length of the longest option string of this download
257 my $plugoutargs = $plugoutoptions->{'args'};
258 if (defined($plugoutargs)) {
259 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
260 if ($longest > $maxoffset) {
261 $maxoffset = $longest;
262 }
263 }
264
265 # Recurse up the download hierarchy
266 $maxoffset = $self->determine_description_offset($maxoffset);
267 $self->{'option_list'} = \@optionlist;
268 return $maxoffset;
269}
270
271
272sub print_plugout_usage
273{
274 my $self = shift(@_);
275 my $descoffset = shift(@_);
276 my $isleafclass = shift(@_);
277
278 my $optionlistref = $self->{'option_list'};
279 my @optionlist = @$optionlistref;
280 my $plugoutoptions = shift(@$optionlistref);
281 return if (!defined($plugoutoptions));
282
283 my $plugoutname = $plugoutoptions->{'name'};
284 my $plugoutargs = $plugoutoptions->{'args'};
285 my $plugoutdesc = $plugoutoptions->{'desc'};
286
287 # Produce the usage information using the data structure above
288 if ($isleafclass) {
289 if (defined($plugoutdesc)) {
290 gsprintf(STDERR, "$plugoutdesc\n\n");
291 }
292 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
293 }
294
295 # Display the download options, if there are some
296 if (defined($plugoutargs)) {
297 # Calculate the column offset of the option descriptions
298 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
299
300 if ($isleafclass) {
301 gsprintf(STDERR, " {common.specific_options}:\n");
302 }
303 else {
304 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
305 }
306
307 # Display the download options
308 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
309 }
310
311 # Recurse up the download hierarchy
312 $self->print_plugout_usage($descoffset, 0);
313 $self->{'option_list'} = \@optionlist;
314}
315
316
317sub error
318{
319 my ($strFunctionName,$strError) = @_;
320 {
321 print "Error occoured in BasePlugout.pm\n".
322 "In Function: ".$strFunctionName."\n".
323 "Error Message: ".$strError."\n";
324 exit(-1);
325 }
326}
327
328# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
329sub set_OIDtype {
330 my $self = shift (@_);
331 my ($type, $metadata) = @_;
332
333 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
334 $self->{'OIDtype'} = $type;
335 } else {
336 $self->{'OIDtype'} = "hash";
337 }
338 if ($type =~ /^assigned$/) {
339 if (defined $metadata) {
340 $self->{'OIDmetadata'} = $metadata;
341 } else {
342 $self->{'OIDmetadata'} = "dc.Identifier";
343 }
344 }
345}
346
347sub set_output_dir
348{
349 my $self = shift @_;
350 my ($output_dir) = @_;
351
352 $self->{'output_dir'} = $output_dir;
353}
354
355sub setoutputdir
356{
357 my $self = shift @_;
358 my ($output_dir) = @_;
359
360 $self->{'output_dir'} = $output_dir;
361}
362
363sub get_output_dir
364{
365 my $self = shift (@_);
366
367 return $self->{'output_dir'};
368}
369
370sub getoutputdir
371{
372 my $self = shift (@_);
373
374 return $self->{'output_dir'};
375}
376
377sub getoutputinfo
378{
379 my $self = shift (@_);
380
381 return $self->{'output_info'};
382}
383
384
385sub get_output_handler
386{
387 my $self = shift (@_);
388
389 my ($output_file_name) = @_;
390
391 my $fh;
392 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
393
394 return $fh;
395}
396
397sub release_output_handler
398{
399 my $self = shift (@_);
400 my ($outhandler) = @_;
401
402 close($outhandler);
403
404}
405
406sub output_xml_header {
407 my $self = shift (@_);
408 my ($handle,$docroot,$nondoctype) = @_;
409
410
411 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
412
413 #For Dspace must be UTF in lower case
414 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
415
416 if (!defined $nondoctype){
417 my $doctype = (defined $docroot) ? $docroot : "Section";
418
419 # Used to be '<!DOCTYPE Archive SYSTEM ...'
420
421 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
422 }
423
424 print $handle "<$docroot>\n" if defined $docroot;
425}
426
427sub output_xml_footer {
428 my $self = shift (@_);
429 my ($handle,$docroot) = @_;
430 print $handle "</$docroot>\n" if defined $docroot;
431}
432
433
434sub output_general_xml_header
435{
436 my $self = shift (@_);
437 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
438
439 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
440
441 if (defined $opt_dtd) {
442 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
443 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
444 }
445
446 if (defined $docroot) {
447 my $full_docroot = $docroot;
448 if (defined $opt_attributes) {
449 $full_docroot .= " $opt_attributes";
450 }
451
452 print $handle "<$full_docroot>\n"
453 }
454}
455
456sub output_general_xml_footer
457{
458 output_xml_footer(@_);
459}
460
461# This is called by the plugins after read_into_doc_obj generates the doc_obj.
462sub process {
463 my $self = shift (@_);
464 my ($doc_obj) = @_;
465
466 my $output_info = $self->{'output_info'};
467 return if (!defined $output_info);
468
469 # for OAI purposes
470 $doc_obj->set_lastmodified();
471 $doc_obj->set_oailastmodified();
472
473 # find out which directory to save to
474 my $doc_dir = "";
475 if ($self->is_group()) {
476 $doc_dir = $self->get_group_doc_dir($doc_obj);
477 } else {
478 $doc_dir = $self->get_doc_dir($doc_obj);
479 }
480
481 ##############################
482 # call subclass' saveas method
483 ##############################
484 $self->saveas($doc_obj,$doc_dir);
485
486 # write out data to archiveinf-doc.db
487 if ($self->{'generate_databases'}) {
488 $self->archiveinf_db($doc_obj);
489 }
490 if ($self->is_group()) {
491 $self->{'gs_count'}++; # do we want this for all cases?
492 $self->{'group_position'}++;
493 }
494}
495
496sub store_output_info_reference {
497 my $self = shift (@_);
498 my ($doc_obj) = @_;
499
500 my $output_info = $self->{'output_info'};
501 my $metaname = $self->{'sortmeta'};
502
503 my $group_position;
504 if ($self->is_group()) {
505 $group_position = $self->{'group_position'};
506 }
507 if (!defined $metaname || $metaname !~ /\S/) {
508 my $OID = $doc_obj->get_OID();
509 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
510 return;
511 }
512
513 if ($metaname eq "OID") { # sort by OID
514 my $OID = $doc_obj->get_OID();
515 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
516 return;
517 }
518
519 my $metadata = "";
520 my $top_section = $doc_obj->get_top_section();
521
522 my @commameta_list = split(/,/, $metaname);
523 foreach my $cmn (@commameta_list) {
524 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
525 if ($meta) {
526 # do remove prefix/suffix - this will apply to all values
527 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
528 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
529 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
530 $metadata .= $meta if ($meta);
531 }
532 }
533
534 # store reference in the output_info
535 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
536
537}
538
539
540
541sub saveas {
542 my $self = shift (@_);
543 my ($doc_obj, $doc_dir) = @_;
544
545 die "Basplug::saveas function must be implemented in sub classes\n";
546}
547
548sub get_group_doc_dir {
549 my $self = shift (@_);
550 my ($doc_obj) = @_;
551
552 my $outhandle = $self->{'output_handle'};
553 my $OID = $doc_obj->get_OID();
554 $OID = "NULL" unless defined $OID;
555
556 my $groupsize = $self->{'group_size'};
557 my $gs_count = $self->{'gs_count'};
558 my $open_new_file = (($gs_count % $groupsize)==0);
559
560 my $doc_dir;
561
562 if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
563 # if we have some assoc files, then we will need to start a new file
564 if ($self->{'verbosity'} > 2) {
565 print $outhandle " Starting a archives folder for $OID as it has associated files\n";
566 }
567 $open_new_file = 1;
568 }
569
570 # opening a new file
571 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) {
572 # first we close off the old output
573 if ($gs_count>0)
574 {
575 return if (!$self->close_group_output());
576 }
577
578 # this will create the directory
579 $doc_dir = $self->get_doc_dir ($doc_obj);
580 $self->{'new_doc_dir'} = 1;
581 $self->{'gs_doc_dir'} = $doc_dir;
582 $self->{'group_position'} = 1;
583 }
584 else {
585 $doc_dir = $self->{'gs_doc_dir'};
586 $self->{'new_doc_dir'} = 0;
587 }
588 return $doc_dir;
589
590}
591sub get_doc_dir {
592
593 my $self = shift (@_);
594 my ($doc_obj) = @_;
595
596 my $OID = $doc_obj->get_OID();
597 $OID = "NULL" unless defined $OID;
598
599 my $working_dir = $self->get_output_dir();
600 my $working_info = $self->{'output_info'};
601 return if (!defined $working_info);
602
603 my $doc_info = $working_info->get_info($OID);
604 my $doc_dir = '';
605
606 if (defined $doc_info && scalar(@$doc_info) >= 1)
607 {
608 # This OID already has an archives directory, so use it again
609 $doc_dir = $doc_info->[0];
610 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
611 }
612 elsif ($self->{'keep_import_structure'})
613 {
614 my $source_filename = $doc_obj->get_source_filename();
615 $source_filename = &File::Basename::dirname($source_filename);
616 $source_filename =~ s/[\\\/]+/\//g;
617 $source_filename =~ s/\/$//;
618
619 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
620 }
621
622 # We have to use a new archives directory for this document
623 if ($doc_dir eq "")
624 {
625 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
626 }
627
628 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
629
630 return $doc_dir;
631}
632
633
634## @function get_new_doc_dir()
635#
636# Once a doc object is ready to write to disk (and hence has a nice OID),
637# generate a unique subdirectory to write the information to.
638#
639# - create the directory as part of this call, to try and avoid race conditions
640# found in parallel processing [jmt12]
641#
642# @todo figure out what the rule regarding $work_info->size() is meant to do
643#
644# @todo determine what $self->{'group'} is, and whether it should affect
645# directory creation
646#
647sub get_new_doc_dir
648{
649 my $self = shift (@_);
650 my($working_info,$working_dir,$OID) = @_;
651
652 my $doc_dir = "";
653 my $doc_dir_rest = $OID;
654
655 # remove any \ and / from the OID
656 $doc_dir_rest =~ s/[\\\/]//g;
657
658 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
659 if ($ENV{'GSDLOS'} =~ /^windows$/i)
660 {
661 $doc_dir_rest =~ s/\://g;
662 }
663
664 # we generally create a unique directory by adding consequtive fragments of
665 # the document identifier (split by some predefined length - defaulting to
666 # 8) until we find a directory that doesn't yet exist. Note that directories
667 # that contain a document have a suffix ".dir" (whereas those that contain
668 # only subdirectories have no suffix).
669 my $doc_dir_num = 0; # how many directories deep we are
670 my $created_directory = 0; # have we successfully created a new directory
671 do
672 {
673 # (does this work on windows? - jmt12)
674 if ($doc_dir_num > 0)
675 {
676 $doc_dir .= '/';
677 }
678
679 # the default matching pattern grabs the next 'subdir_split_length'
680 # characters of the OID to act as the next subdirectory
681 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
682
683 # Do we count any "HASH" prefix against the split length limit?
684 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
685 {
686 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
687 }
688
689 # Note the use of 's' to both capture the next chuck of OID and to remove
690 # it from OID at the same time
691 if ($doc_dir_rest =~ s/$pattern//i)
692 {
693 $doc_dir .= $1;
694 $doc_dir_num++;
695
696 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
697 if(!FileUtils::directoryExists($full_doc_dir))
698 {
699 &FileUtils::makeAllDirectories($full_doc_dir);
700 $created_directory = 1;
701 }
702
703 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
704 ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
705 ###rint STDERR " - rest: $doc_dir_rest\n";
706 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
707 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
708 }
709 }
710 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
711
712 # not unique yet? Add on an incremental suffix until we are unique
713 my $i = 1;
714 my $doc_dir_base = $doc_dir;
715 while ($created_directory == 0)
716 {
717 $doc_dir = $doc_dir_base . '-' . $i;
718 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
719 $i++;
720 }
721
722 # in theory this should never happen
723 if (!$created_directory)
724 {
725 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
726 }
727
728 return $doc_dir . '.dir';
729}
730## get_new_doc_dir()
731
732
733sub process_assoc_files {
734 my $self = shift (@_);
735 my ($doc_obj, $doc_dir, $handle) = @_;
736
737 my $outhandle = $self->{'output_handle'};
738
739 my $output_dir = $self->get_output_dir();
740 return if (!defined $output_dir);
741
742 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
743
744 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
745 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
746
747 my @assoc_files = ();
748 my $filename;;
749
750 my $source_filename = $doc_obj->get_source_filename();
751
752 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
753
754 if (defined $collect_dir) {
755 my $dirsep_regexp = &util::get_os_dirsep();
756
757 if ($collect_dir !~ /$dirsep_regexp$/) {
758 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
759 }
760
761 # This test is never going to fail on Windows -- is this a problem?
762
763 if ($source_filename !~ /^$dirsep_regexp/) {
764 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
765 }
766 }
767
768
769 # set the assocfile path (even if we have no assoc files - need this for lucene)
770 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
771 "assocfilepath",
772 "$doc_dir");
773 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
774 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
775 $dir = "" unless defined $dir;
776
777 my $utf8_real_filename = $assoc_file_rec->[0];
778
779 # for some reasons the image associate file has / before the full path
780 $utf8_real_filename =~ s/^\\(.*)/$1/i;
781
782## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
783 my $real_filename = $utf8_real_filename;
784 $real_filename = &util::downgrade_if_dos_filename($real_filename);
785
786 if (&FileUtils::fileExists($real_filename)) {
787
788 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
789
790 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
791
792 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
793 "gsdlassocfile",
794 "$afile:$assoc_file_rec->[2]:$dir");
795 } elsif ($self->{'verbosity'} > 1) {
796 print $outhandle "BasePlugout::process couldn't copy the associated file " .
797 "$real_filename to $afile\n";
798 }
799 }
800}
801
802
803sub process_metafiles_metadata
804{
805 my $self = shift (@_);
806 my ($doc_obj) = @_;
807
808 my $top_section = $doc_obj->get_top_section();
809 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
810
811 foreach my $metafile_pair (@$metafiles) {
812 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
813
814 $doc_obj->metadata_file($full_metafile,$metafile);
815 }
816
817 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
818}
819
820sub archiveinf_files_to_field
821{
822 my $self = shift(@_);
823 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
824
825 foreach my $file_rec (@$files) {
826 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
827 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
828 # for some reasons the image associate file has / before the full path
829 $real_filename =~ s/^\\(.*)/$1/i;
830
831 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
832
833 if (&FileUtils::fileExists($raw_filename)) {
834
835# if (defined $collect_dir) {
836# my $collect_dir_re_safe = $collect_dir;
837# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
838# $collect_dir_re_safe =~ s/\./\\./g;##
839
840# $real_filename =~ s/^$collect_dir_re_safe//;
841# }
842
843 if (defined $reverse_lookups) {
844 $reverse_lookups->{$real_filename} = 1;
845 }
846
847 if($field =~ m@assoc-file|src-file|meta-file@) {
848 $raw_filename = &util::abspath_to_placeholders($raw_filename);
849 }
850
851### push(@{$oid_files->{$field}},$full_file);
852 push(@{$oid_files->{$field}},$raw_filename);
853 }
854 else {
855 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
856 }
857 }
858}
859
860sub archiveinf_db
861{
862 my $self = shift (@_);
863 my ($doc_obj) = @_;
864
865 my $verbosity = $self->{'verbosity'};
866
867 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
868 if (defined $collect_dir) {
869 my $dirsep_regexp = &util::get_os_dirsep();
870
871 if ($collect_dir !~ /$dirsep_regexp$/) {
872 # ensure there is a slash at the end
873 $collect_dir .= &util::get_dirsep();
874 }
875 }
876
877 my $oid = $doc_obj->get_OID();
878 my $source_filename = $doc_obj->get_unmodified_source_filename();
879 my $working_info = $self->{'output_info'};
880 my $doc_info = $working_info->get_info($oid);
881
882 my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
883 # doc_file is the path to the archive doc.xml. Make sure it has unix
884 # slashes, then if the collection is copied to linux, it can be built without reimport
885 $doc_file =~ s/\\/\//g;
886 my $oid_files = { 'doc-file' => $doc_file,
887 'index-status' => $index_status,
888 'src-file' => $source_filename,
889 'sort-meta' => $sortmeta,
890 'assoc-file' => [],
891 'meta-file' => [] };
892 if (defined $group_position) {
893 $oid_files->{'group-position'} = $group_position;
894 }
895 my $reverse_lookups = { $source_filename => "1" };
896
897
898 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
899 $collect_dir,$oid_files,$reverse_lookups);
900
901
902 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
903 $collect_dir,$oid_files);
904
905 # Get the infodbtype value for this collection from the arcinfo object
906 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
907 my $output_dir = $self->{'output_dir'};
908
909 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
910
911 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
912
913 if (!$self->{'no_rss'})
914 {
915 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
916 my $top_section = $doc_obj->get_top_section();
917
918 # rss_title can be set in collect.cfg as follows:
919 # plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
920 # rss_title is a semi-colon or comma-separated list of the metadata field names that should
921 # be consulted in order to obtain a Title (anchor text) for the RSS document link.
922 # If not specified, rss_title will default to dc.Title, and fall back on Untitled
923 my $metafieldnames = $self->{'rss_title'};
924 my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
925 my $titles;
926 #@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
927 foreach my $metafieldname (@metafieldarray) {
928 $metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
929 $titles = $doc_obj->get_metadata($top_section,$metafieldname);
930
931 if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
932 last; # break out of the loop
933 }
934 }
935
936 # if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
937 # otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
938 if(scalar(@$titles) == 0) { #&& $metafieldnames !~ [email protected]@) {
939 push(@$titles, "Untitled");
940 }
941
942 # encode basic html entities like <>"& in the title(s), since the & char can break RSS links
943 for (my $i = 0; $i < scalar(@$titles); $i++) {
944 &ghtml::htmlsafe(@$titles[$i]);
945 }
946
947 my $dc_title = join("; ", @$titles);
948
949 if ($oid_files->{'index-status'} eq "R") {
950 $dc_title .= " (Updated)";
951 }
952
953 my $rss_entry = "<item>\n";
954 $rss_entry .= " <title>$dc_title</title>\n";
955 if(&util::is_gs3()) {
956 $rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
957 } else {
958 $rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
959 }
960 $rss_entry .= "</item>";
961
962 if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
963 {
964 my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
965 my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
966 &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
967 &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
968 }
969 else
970 {
971 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
972 my $rss_fh;
973 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
974 {
975 print $rss_fh $rss_entry . "\n";
976 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
977 }
978 else
979 {
980 print STDERR "**** Failed to open $rss_filename\n$!\n";
981 }
982 }
983 }
984 }
985
986 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
987 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
988 $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
989 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
990 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
991 if (defined $oid_files->{'group-position'}) {
992 $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
993 }
994
995 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
996 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
997 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
998
999 foreach my $rl (keys %$reverse_lookups) {
1000 $working_info->add_reverseinfo($rl,$oid);
1001 }
1002
1003 # meta files not set in reverse entry, but need to set the metadata flag
1004 if (defined $doc_obj->get_meta_files()) {
1005 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1006 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1007 $working_info->set_meta_file_flag($full_file);
1008 }
1009 }
1010}
1011
1012# This sub is called for every metadata.xml accepted for processing by by MetdataXMLPlugin
1013# and adds an entry into archiveinf-src.db for that file in the form:
1014# [@THISCOLLECTPATH@/import/metadata.xml]
1015# <meta-file>1
1016# This prevents blind reprocessing of the same old docs upon *incremental* building whenever
1017# we encounter a default empty metadata.xml that has no actual <FileSet> content defined.
1018sub add_metaxml_file_entry_to_archiveinfsrc {
1019 my $self = shift (@_);
1020 my ($full_file) = @_;
1021
1022 print STDERR "**** Adding metaxml file entry for full_file: $full_file\n";
1023 my $working_info = $self->{'output_info'};
1024 $working_info->set_meta_file_flag($full_file);
1025}
1026
1027
1028sub set_sortmeta {
1029 my $self = shift (@_);
1030 my ($sortmeta, $removeprefix, $removesuffix) = @_;
1031
1032 $self->{'sortmeta'} = $sortmeta;
1033 if (defined ($removeprefix) && $removeprefix ) {
1034 $removeprefix =~ s/^\^//; # don't need a leading ^
1035 $self->{'removeprefix'} = $removeprefix;
1036 }
1037 if (defined ($removesuffix) && $removesuffix) {
1038 $removesuffix =~ s/\$$//; # don't need a trailing $
1039 $self->{'removesuffix'} = $removesuffix;
1040 }
1041}
1042
1043
1044
1045sub open_xslt_pipe
1046{
1047 my $self = shift @_;
1048 my ($output_file_name, $xslt_file)=@_;
1049
1050 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1051
1052 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1053
1054 my $mapping_file_path = "";
1055
1056 if ($ENV{'GSDLOS'} eq "windows"){
1057 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1058 # this file:/// bit didn't work for me on windows XP
1059 #$xslt_file = "\"file:///".$xslt_file."\"";
1060 #$mapping_file_path = "\"file:///";
1061 }
1062 else{
1063 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1064 }
1065
1066
1067 $java_class_path = "\"".$java_class_path."\"";
1068
1069 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1070
1071 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1072 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1073 $cmd .= "-m $mapping_file_path";
1074 }
1075
1076 open(*XMLWRITER, $cmd)
1077 or die "can't open pipe to xslt: $!";
1078
1079
1080 $self->{'xslt_writer'} = *XMLWRITER;
1081
1082 print XMLWRITER "<?DocStart?>\n";
1083 print XMLWRITER "$output_file_name\n";
1084
1085
1086 }
1087
1088
1089sub close_xslt_pipe
1090{
1091 my $self = shift @_;
1092
1093
1094 return unless defined $self->{'xslt_writer'} ;
1095
1096 my $xsltwriter = $self->{'xslt_writer'};
1097
1098 print $xsltwriter "<?DocEnd?>\n";
1099 close($xsltwriter);
1100
1101 undef $self->{'xslt_writer'};
1102
1103}
1104
1105
1106
1107#the subclass should implement this method if is_group method could return 1.
1108sub close_group_output{
1109 my $self = shift (@_);
1110}
1111
1112sub is_group {
1113 my $self = shift (@_);
1114 return 0;
1115}
1116
1117my $dc_set = { Title => 1,
1118 Creator => 1,
1119 Subject => 1,
1120 Description => 1,
1121 Publisher => 1,
1122 Contributor => 1,
1123 Date => 1,
1124 Type => 1,
1125 Format => 1,
1126 Identifier => 1,
1127 Source => 1,
1128 Language => 1,
1129 Relation => 1,
1130 Coverage => 1,
1131 Rights => 1};
1132
1133
1134# returns an XML representation of the dublin core metadata
1135# if dc meta is not found, try ex meta
1136# This method is not used by the DSpacePlugout, which has its
1137# own method to save its dc metadata
1138sub get_dc_metadata {
1139 my $self = shift(@_);
1140 my ($doc_obj, $section, $version) = @_;
1141
1142 # build up string of dublin core metadata
1143 $section="" unless defined $section;
1144
1145 my $section_ptr = $doc_obj->_lookup_section($section);
1146 return "" unless defined $section_ptr;
1147
1148
1149 my $explicit_dc = {};
1150 my $explicit_ex_dc = {};
1151 my $explicit_ex = {};
1152
1153 my $all_text="";
1154
1155 # We want high quality dc metadata to go in first, so we store all the
1156 # assigned dc.* values first. Then, for all those dc metadata names in
1157 # the official dc set that are as yet unassigned, we look to see whether
1158 # embedded ex.dc.* metadata has defined some values for them. If not,
1159 # then for the same missing dc metadata names, we look in ex metadata.
1160
1161 foreach my $data (@{$section_ptr->{'metadata'}}){
1162 my $escaped_value = &docprint::escape_text($data->[1]);
1163 if ($data->[0]=~ m/^dc\./) {
1164 $data->[0] =~ tr/[A-Z]/[a-z]/;
1165
1166 $data->[0] =~ m/^dc\.(.*)/;
1167 my $dc_element = $1;
1168
1169 if (!defined $explicit_dc->{$dc_element}) {
1170 $explicit_dc->{$dc_element} = [];
1171 }
1172 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1173
1174 if (defined $version && ($version eq "oai_dc")) {
1175 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1176 }
1177 else {
1178 # qualifier???
1179 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1180 }
1181
1182 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1183 $data->[0] =~ m/^ex\.dc\.(.*)/;
1184 my $ex_dc_element = $1;
1185 my $lc_ex_dc_element = lc($ex_dc_element);
1186
1187 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1188 if (defined $dc_set->{$ex_dc_element}) {
1189 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1190 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1191 }
1192 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1193 }
1194 }
1195 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1196 $data->[0] =~ m/^(ex\.)?(.*)/;
1197 my $ex_element = $2;
1198 my $lc_ex_element = lc($ex_element);
1199
1200 if (defined $dc_set->{$ex_element}) {
1201 if (!defined $explicit_ex->{$lc_ex_element}) {
1202 $explicit_ex->{$lc_ex_element} = [];
1203 }
1204 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1205 }
1206 }
1207 }
1208
1209 # go through dc_set and for any element *not* defined in explicit_dc
1210 # that does exist in explicit_ex, add it in as metadata
1211 foreach my $k ( keys %$dc_set ) {
1212 my $lc_k = lc($k);
1213
1214 if (!defined $explicit_dc->{$lc_k}) {
1215 # try to find if ex.dc.* defines this dc.* meta,
1216 # if not, then look for whether there's an ex.* equivalent
1217
1218 if (defined $explicit_ex_dc->{$lc_k}) {
1219 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1220 my $dc_element = $lc_k;
1221 my $escaped_value = $v;
1222
1223 if (defined $version && ($version eq "oai_dc")) {
1224 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1225 }
1226 else {
1227 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1228 }
1229 }
1230 } elsif (defined $explicit_ex->{$lc_k}) {
1231 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1232 my $dc_element = $lc_k;
1233 my $escaped_value = $v;
1234
1235 if (defined $version && ($version eq "oai_dc")) {
1236 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1237 }
1238 else {
1239 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1240 }
1241 }
1242 }
1243 }
1244 }
1245
1246 if ($all_text eq "") {
1247 $all_text .= " There is no Dublin Core metatdata in this document\n";
1248 }
1249 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1250
1251 return $all_text;
1252}
1253
1254# Build up dublin_core metadata. Priority given to dc.* over ex.*
1255# This method was apparently added by Jeffrey and committed by Shaoqun.
1256# But we don't know why it was added, so not using it anymore.
1257sub new_get_dc_metadata {
1258
1259 my $self = shift(@_);
1260 my ($doc_obj, $section, $version) = @_;
1261
1262 # build up string of dublin core metadata
1263 $section="" unless defined $section;
1264
1265 my $section_ptr=$doc_obj->_lookup_section($section);
1266 return "" unless defined $section_ptr;
1267
1268 my $all_text = "";
1269 foreach my $data (@{$section_ptr->{'metadata'}}){
1270 my $escaped_value = &docprint::escape_text($data->[1]);
1271 my $dc_element = $data->[0];
1272
1273 my @array = split('\.',$dc_element);
1274 my ($type,$name);
1275
1276 if(defined $array[1])
1277 {
1278 $type = $array[0];
1279 $name = $array[1];
1280 }
1281 else
1282 {
1283 $type = "ex";
1284 $name = $array[0];
1285 }
1286
1287 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1288 }
1289 return $all_text;
1290}
1291
1292
12931;
Note: See TracBrowser for help on using the repository browser.