source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 32563

Last change on this file since 32563 was 32563, checked in by ak19, 5 years ago
  1. Overhaul of GreenstoneSQLPlugs to handle removeold and incremental delete correctly. And now code also automatically handles 'non-incremental delete' (see mention in ArchivesInfPlugin). The new version no longer does lazy loading for getting the sql db connection in the GS SQL Plugin, as now the connection needs to be active since the start of the plugin to run SQL delete statements on remove_old. So the db connection code for the GS SQL plugin has moved back into its init() method. Lots of changes to gssql.pm (and some flow on effects to the GS SQL Plugout) as when database tables exist and need to be created have changed. 2. Undoing most of the changes of changeset 32555 since we're doing incremental delete and removeold differently and in the correct way now when using the GreenstoneSQLPlugs.
  • Property svn:keywords set to Author Date Id Revision
File size: 38.3 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40use sorttools;
41
42# suppress the annoying "subroutine redefined" warning that various
43# gets cause under perl 5.6
44$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
45
46my $arguments = [
47 { 'name' => "site_name",
48 'desc' => "{BasPlugout.site_name}",
49 'type' => "string",
50 'reqd' => "no",
51 'hiddengli' => "yes" },
52 { 'name' => "xslt_file",
53 'desc' => "{BasPlugout.xslt_file}",
54 'type' => "string",
55 'reqd' => "no",
56 'deft' => "",
57 'hiddengli' => "no"},
58 { 'name' => "subdir_split_length",
59 'desc' => "{BasPlugout.subdir_split_length}",
60 'type' => "int",
61 'reqd' => "no",
62 'deft' => "8",
63 'hiddengli' => "no"},
64 { 'name' => "subdir_hash_prefix",
65 'desc' => "{BasPlugout.subdir_hash_prefix}",
66 'type' => "flag",
67 'reqd' => "no",
68 'deft' => "0",
69 'hiddengli' => "no"},
70 { 'name' => "gzip_output",
71 'desc' => "{BasPlugout.gzip_output}",
72 'type' => "flag",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "verbosity",
76 'desc' => "{BasPlugout.verbosity}",
77 'type' => "int",
78 'deft' => "0",
79 'reqd' => "no",
80 'hiddengli' => "no"},
81 { 'name' => "output_info",
82 'desc' => "{BasPlugout.output_info}",
83 'type' => "string",
84 'reqd' => "yes",
85 'hiddengli' => "yes"},
86 { 'name' => "output_handle",
87 'desc' => "{BasPlugout.output_handle}",
88 'type' => "string",
89 'deft' => 'STDERR',
90 'reqd' => "no",
91 'hiddengli' => "yes"},
92 { 'name' => "debug",
93 'desc' => "{BasPlugout.debug}",
94 'type' => "flag",
95 'reqd' => "no",
96 'hiddengli' => "yes"},
97 { 'name' => 'no_rss',
98 'desc' => "{BasPlugout.no_rss}",
99 'type' => 'flag',
100 'reqd' => 'no',
101 'hiddengli' => 'yes'},
102 { 'name' => 'rss_title',
103 'desc' => "{BasPlugout.rss_title}",
104 'type' => 'string',
105 'deft' => 'dc.Title',
106 'reqd' => 'no',
107 'hiddengli' => 'yes'},
108 { 'name' => "no_auxiliary_databases",
109 'desc' => "{BasPlugout.no_auxiliary_databases}",
110 'type' => "flag",
111 'reqd' => "no",
112 'hiddengli' => "yes"}
113
114];
115
116my $options = { 'name' => "BasePlugout",
117 'desc' => "{BasPlugout.desc}",
118 'abstract' => "yes",
119 'inherits' => "no",
120 'args' => $arguments};
121
122sub new
123{
124 my $class = shift (@_);
125
126 my ($plugoutlist,$args,$hashArgOptLists) = @_;
127 push(@$plugoutlist, $class);
128
129 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
130
131 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
132 push(@{$hashArgOptLists->{"OptList"}},$options);
133
134 my $self = {};
135 $self->{'plugout_type'} = $class;
136 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
137 $self->{"info_only"} = 0;
138
139 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
140 # the args, just return the object.
141 #print STDERR "#### " . join(",", @${args}) . "\n\n";
142 my $v=0;
143 foreach my $strArg (@{$args})
144 {
145 if(defined $strArg) {
146 if($strArg eq "-gsdlinfo")
147 {
148 $self->{"info_only"} = 1;
149 return bless $self, $class;
150 }
151 elsif ($strArg eq "-site_name") {
152 $v = $strArg;
153 }
154 elsif($v eq "-site_name") {
155 $self->{'site_name'} = $strArg;
156 }
157 }
158 }
159
160 delete $self->{"info_only"};
161
162 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
163 {
164 my $classTempClass = bless $self, $class;
165 print STDERR "<BadPlugout d=$plugout_name>\n";
166 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
167 $classTempClass->print_txt_usage(""); # Use default resource bundle
168 die "\n";
169 }
170
171
172 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
173 {
174 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
175 if (!defined $full_file_path) {
176 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
177 die "\n";
178 }
179 $self->{'xslt_file'} = $full_file_path;
180 }
181
182 # for group processing
183 $self->{'gs_count'} = 0;
184 $self->{'group_position'} = 1;
185
186 $self->{'keep_import_structure'} = 0;
187
188 $self->{'generate_databases'} = 1;
189 if ($self->{'no_auxiliary_databases'}) {
190 $self->{'generate_databases'} = 0;
191 }
192 undef $self->{'no_auxiliary_databases'};
193 return bless $self, $class;
194
195}
196
197# implement this in subclass if you want to do some initialization after
198# loading and setting parameters, and before processing the documents.
199sub begin {
200
201 my $self= shift (@_);
202
203}
204# implement in subclasses if it needs some non-group related cleanup (post-group cleanup
205# Like begin(), end() is also called by inexport.pm
206sub end {
207 my $self= shift (@_);
208
209}
210sub print_xml_usage
211{
212 my $self = shift(@_);
213 my $header = shift(@_);
214 my $high_level_information_only = shift(@_);
215
216 # XML output is always in UTF-8
217 gsprintf::output_strings_in_UTF8;
218
219 if ($header) {
220 &PrintUsage::print_xml_header("plugout");
221 }
222 $self->print_xml($high_level_information_only);
223}
224
225
226sub print_xml
227{
228 my $self = shift(@_);
229 my $high_level_information_only = shift(@_);
230
231 my $optionlistref = $self->{'option_list'};
232 my @optionlist = @$optionlistref;
233 my $plugoutoptions = shift(@$optionlistref);
234 return if (!defined($plugoutoptions));
235
236 gsprintf(STDERR, "<PlugoutInfo>\n");
237 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
238 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
239 $desc =~ s/</&amp;lt;/g; # doubly escaped
240 $desc =~ s/>/&amp;gt;/g;
241 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
242 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
243 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
244 unless (defined($high_level_information_only)) {
245 gsprintf(STDERR, " <Arguments>\n");
246 if (defined($plugoutoptions->{'args'})) {
247 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
248 }
249 gsprintf(STDERR, " </Arguments>\n");
250
251 # Recurse up the plugout hierarchy
252 $self->print_xml();
253 }
254 gsprintf(STDERR, "</PlugoutInfo>\n");
255}
256
257
258sub print_txt_usage
259{
260 my $self = shift(@_);
261
262 # Print the usage message for a plugout (recursively)
263 my $descoffset = $self->determine_description_offset(0);
264 $self->print_plugout_usage($descoffset, 1);
265}
266
267sub determine_description_offset
268{
269 my $self = shift(@_);
270 my $maxoffset = shift(@_);
271
272 my $optionlistref = $self->{'option_list'};
273 my @optionlist = @$optionlistref;
274 my $plugoutoptions = pop(@$optionlistref);
275 return $maxoffset if (!defined($plugoutoptions));
276
277 # Find the length of the longest option string of this download
278 my $plugoutargs = $plugoutoptions->{'args'};
279 if (defined($plugoutargs)) {
280 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
281 if ($longest > $maxoffset) {
282 $maxoffset = $longest;
283 }
284 }
285
286 # Recurse up the download hierarchy
287 $maxoffset = $self->determine_description_offset($maxoffset);
288 $self->{'option_list'} = \@optionlist;
289 return $maxoffset;
290}
291
292
293sub print_plugout_usage
294{
295 my $self = shift(@_);
296 my $descoffset = shift(@_);
297 my $isleafclass = shift(@_);
298
299 my $optionlistref = $self->{'option_list'};
300 my @optionlist = @$optionlistref;
301 my $plugoutoptions = shift(@$optionlistref);
302 return if (!defined($plugoutoptions));
303
304 my $plugoutname = $plugoutoptions->{'name'};
305 my $plugoutargs = $plugoutoptions->{'args'};
306 my $plugoutdesc = $plugoutoptions->{'desc'};
307
308 # Produce the usage information using the data structure above
309 if ($isleafclass) {
310 if (defined($plugoutdesc)) {
311 gsprintf(STDERR, "$plugoutdesc\n\n");
312 }
313 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
314 }
315
316 # Display the download options, if there are some
317 if (defined($plugoutargs)) {
318 # Calculate the column offset of the option descriptions
319 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
320
321 if ($isleafclass) {
322 gsprintf(STDERR, " {common.specific_options}:\n");
323 }
324 else {
325 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
326 }
327
328 # Display the download options
329 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
330 }
331
332 # Recurse up the download hierarchy
333 $self->print_plugout_usage($descoffset, 0);
334 $self->{'option_list'} = \@optionlist;
335}
336
337
338sub error
339{
340 my ($strFunctionName,$strError) = @_;
341 {
342 print "Error occoured in BasePlugout.pm\n".
343 "In Function: ".$strFunctionName."\n".
344 "Error Message: ".$strError."\n";
345 exit(-1);
346 }
347}
348
349
350# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
351sub set_OIDtype {
352 my $self = shift (@_);
353 my ($type, $metadata) = @_;
354
355 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
356 $self->{'OIDtype'} = $type;
357 } else {
358 $self->{'OIDtype'} = "hash";
359 }
360 if ($type =~ /^assigned$/) {
361 if (defined $metadata) {
362 $self->{'OIDmetadata'} = $metadata;
363 } else {
364 $self->{'OIDmetadata'} = "dc.Identifier";
365 }
366 }
367}
368
369sub set_output_dir
370{
371 my $self = shift @_;
372 my ($output_dir) = @_;
373
374 $self->{'output_dir'} = $output_dir;
375}
376
377sub setoutputdir
378{
379 my $self = shift @_;
380 my ($output_dir) = @_;
381
382 $self->{'output_dir'} = $output_dir;
383}
384
385sub get_output_dir
386{
387 my $self = shift (@_);
388
389 return $self->{'output_dir'};
390}
391
392sub getoutputdir
393{
394 my $self = shift (@_);
395
396 return $self->{'output_dir'};
397}
398
399sub getoutputinfo
400{
401 my $self = shift (@_);
402
403 return $self->{'output_info'};
404}
405
406
407sub get_output_handler
408{
409 my $self = shift (@_);
410
411 my ($output_file_name) = @_;
412
413 my $fh;
414 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
415
416 return $fh;
417}
418
419sub release_output_handler
420{
421 my $self = shift (@_);
422 my ($outhandler) = @_;
423
424 close($outhandler);
425
426}
427
428sub output_xml_header {
429 my $self = shift (@_);
430 my ($handle,$docroot,$nondoctype) = @_;
431
432
433 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
434
435 #For Dspace must be UTF in lower case
436 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
437
438 if (!defined $nondoctype){
439 my $doctype = (defined $docroot) ? $docroot : "Section";
440
441 # Used to be '<!DOCTYPE Archive SYSTEM ...'
442
443 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
444 }
445
446 print $handle "<$docroot>\n" if defined $docroot;
447}
448
449sub output_xml_footer {
450 my $self = shift (@_);
451 my ($handle,$docroot) = @_;
452 print $handle "</$docroot>\n" if defined $docroot;
453}
454
455
456sub output_general_xml_header
457{
458 my $self = shift (@_);
459 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
460
461 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
462
463 if (defined $opt_dtd) {
464 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
465 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
466 }
467
468 if (defined $docroot) {
469 my $full_docroot = $docroot;
470 if (defined $opt_attributes) {
471 $full_docroot .= " $opt_attributes";
472 }
473
474 print $handle "<$full_docroot>\n"
475 }
476}
477
478sub output_general_xml_footer
479{
480 output_xml_footer(@_);
481}
482
483# This is called by the plugins after read_into_doc_obj generates the doc_obj.
484sub process {
485 my $self = shift (@_);
486 my ($doc_obj) = @_;
487
488 my $output_info = $self->{'output_info'};
489 return if (!defined $output_info);
490
491 # for OAI purposes
492 $doc_obj->set_lastmodified();
493 $doc_obj->set_oailastmodified();
494
495 # find out which directory to save to
496 my $doc_dir = "";
497 if ($self->is_group()) {
498 $doc_dir = $self->get_group_doc_dir($doc_obj);
499 } else {
500 $doc_dir = $self->get_doc_dir($doc_obj);
501 }
502
503 ##############################
504 # call subclass' saveas method
505 ##############################
506 $self->saveas($doc_obj,$doc_dir);
507
508 # write out data to archiveinf-doc.db
509 if ($self->{'generate_databases'}) {
510 $self->archiveinf_db($doc_obj);
511 }
512 if ($self->is_group()) {
513 $self->{'gs_count'}++; # do we want this for all cases?
514 $self->{'group_position'}++;
515 }
516}
517
518sub store_output_info_reference {
519 my $self = shift (@_);
520 my ($doc_obj) = @_;
521
522 my $output_info = $self->{'output_info'};
523 my $metaname = $self->{'sortmeta'};
524
525 my $group_position;
526 if ($self->is_group()) {
527 $group_position = $self->{'group_position'};
528 }
529 if (!defined $metaname || $metaname !~ /\S/) {
530 my $OID = $doc_obj->get_OID();
531 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
532 return;
533 }
534
535 if ($metaname eq "OID") { # sort by OID
536 my $OID = $doc_obj->get_OID();
537 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
538 return;
539 }
540
541 my $metadata = "";
542 my $top_section = $doc_obj->get_top_section();
543
544 my @commameta_list = split(/,/, $metaname);
545 foreach my $cmn (@commameta_list) {
546 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
547 if ($meta) {
548 # do remove prefix/suffix - this will apply to all values
549 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
550 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
551 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
552 $metadata .= $meta if ($meta);
553 }
554 }
555
556 # store reference in the output_info
557 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
558
559}
560
561
562
563sub saveas {
564 my $self = shift (@_);
565 my ($doc_obj, $doc_dir) = @_;
566
567 die "Basplug::saveas function must be implemented in sub classes\n";
568}
569
570sub get_group_doc_dir {
571 my $self = shift (@_);
572 my ($doc_obj) = @_;
573
574 my $outhandle = $self->{'output_handle'};
575 my $OID = $doc_obj->get_OID();
576 $OID = "NULL" unless defined $OID;
577
578 my $groupsize = $self->{'group_size'};
579 my $gs_count = $self->{'gs_count'};
580 my $open_new_file = (($gs_count % $groupsize)==0);
581
582 my $doc_dir;
583
584 if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
585 # if we have some assoc files, then we will need to start a new file
586 if ($self->{'verbosity'} > 2) {
587 print $outhandle " Starting a archives folder for $OID as it has associated files\n";
588 }
589 $open_new_file = 1;
590 }
591
592 # opening a new file
593 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) {
594 # first we close off the old output
595 if ($gs_count>0)
596 {
597 return if (!$self->close_group_output());
598 }
599
600 # this will create the directory
601 $doc_dir = $self->get_doc_dir ($doc_obj);
602 $self->{'new_doc_dir'} = 1;
603 $self->{'gs_doc_dir'} = $doc_dir;
604 $self->{'group_position'} = 1;
605 }
606 else {
607 $doc_dir = $self->{'gs_doc_dir'};
608 $self->{'new_doc_dir'} = 0;
609 }
610 return $doc_dir;
611
612}
613sub get_doc_dir {
614
615 my $self = shift (@_);
616 my ($doc_obj) = @_;
617
618 my $OID = $doc_obj->get_OID();
619 $OID = "NULL" unless defined $OID;
620
621 my $working_dir = $self->get_output_dir();
622 my $working_info = $self->{'output_info'};
623 return if (!defined $working_info);
624
625 my $doc_info = $working_info->get_info($OID);
626 my $doc_dir = '';
627
628 if (defined $doc_info && scalar(@$doc_info) >= 1)
629 {
630 # This OID already has an archives directory, so use it again
631 $doc_dir = $doc_info->[0];
632 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
633 }
634 elsif ($self->{'keep_import_structure'})
635 {
636 my $source_filename = $doc_obj->get_source_filename();
637 $source_filename = &File::Basename::dirname($source_filename);
638 $source_filename =~ s/[\\\/]+/\//g;
639 $source_filename =~ s/\/$//;
640
641 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
642 }
643
644 # We have to use a new archives directory for this document
645 if ($doc_dir eq "")
646 {
647 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
648 }
649
650 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
651
652 return $doc_dir;
653}
654
655
656## @function get_new_doc_dir()
657#
658# Once a doc object is ready to write to disk (and hence has a nice OID),
659# generate a unique subdirectory to write the information to.
660#
661# - create the directory as part of this call, to try and avoid race conditions
662# found in parallel processing [jmt12]
663#
664# @todo figure out what the rule regarding $work_info->size() is meant to do
665#
666# @todo determine what $self->{'group'} is, and whether it should affect
667# directory creation
668#
669sub get_new_doc_dir
670{
671 my $self = shift (@_);
672 my($working_info,$working_dir,$OID) = @_;
673
674 my $doc_dir = "";
675 my $doc_dir_rest = $OID;
676
677 # remove any \ and / from the OID
678 $doc_dir_rest =~ s/[\\\/]//g;
679
680 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
681 if ($ENV{'GSDLOS'} =~ /^windows$/i)
682 {
683 $doc_dir_rest =~ s/\://g;
684 }
685
686 # we generally create a unique directory by adding consequtive fragments of
687 # the document identifier (split by some predefined length - defaulting to
688 # 8) until we find a directory that doesn't yet exist. Note that directories
689 # that contain a document have a suffix ".dir" (whereas those that contain
690 # only subdirectories have no suffix).
691 my $doc_dir_num = 0; # how many directories deep we are
692 my $created_directory = 0; # have we successfully created a new directory
693 do
694 {
695 # (does this work on windows? - jmt12)
696 if ($doc_dir_num > 0)
697 {
698 $doc_dir .= '/';
699 }
700
701 # the default matching pattern grabs the next 'subdir_split_length'
702 # characters of the OID to act as the next subdirectory
703 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
704
705 # Do we count any "HASH" prefix against the split length limit?
706 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
707 {
708 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
709 }
710
711 # Note the use of 's' to both capture the next chuck of OID and to remove
712 # it from OID at the same time
713 if ($doc_dir_rest =~ s/$pattern//i)
714 {
715 $doc_dir .= $1;
716 $doc_dir_num++;
717
718 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
719 if(!FileUtils::directoryExists($full_doc_dir))
720 {
721 &FileUtils::makeAllDirectories($full_doc_dir);
722 $created_directory = 1;
723 }
724
725 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
726 ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
727 ###rint STDERR " - rest: $doc_dir_rest\n";
728 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
729 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
730 }
731 }
732 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
733
734 # not unique yet? Add on an incremental suffix until we are unique
735 my $i = 1;
736 my $doc_dir_base = $doc_dir;
737 while ($created_directory == 0)
738 {
739 $doc_dir = $doc_dir_base . '-' . $i;
740 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
741 $i++;
742 }
743
744 # in theory this should never happen
745 if (!$created_directory)
746 {
747 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
748 }
749
750 return $doc_dir . '.dir';
751}
752## get_new_doc_dir()
753
754
755sub process_assoc_files {
756 my $self = shift (@_);
757 my ($doc_obj, $doc_dir, $handle) = @_;
758
759 my $outhandle = $self->{'output_handle'};
760
761 my $output_dir = $self->get_output_dir();
762 return if (!defined $output_dir);
763
764 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
765
766 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
767 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
768
769 my @assoc_files = ();
770 my $filename;;
771
772 my $source_filename = $doc_obj->get_source_filename();
773
774 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
775
776 if (defined $collect_dir) {
777 my $dirsep_regexp = &util::get_os_dirsep();
778
779 if ($collect_dir !~ /$dirsep_regexp$/) {
780 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
781 }
782
783 # This test is never going to fail on Windows -- is this a problem?
784
785 if ($source_filename !~ /^$dirsep_regexp/) {
786 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
787 }
788 }
789
790
791 # set the assocfile path (even if we have no assoc files - need this for lucene)
792 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
793 "assocfilepath",
794 "$doc_dir");
795 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
796 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
797 $dir = "" unless defined $dir;
798
799 my $utf8_real_filename = $assoc_file_rec->[0];
800
801 # for some reasons the image associate file has / before the full path
802 $utf8_real_filename =~ s/^\\(.*)/$1/i;
803
804## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
805 my $real_filename = $utf8_real_filename;
806 $real_filename = &util::downgrade_if_dos_filename($real_filename);
807
808 if (&FileUtils::fileExists($real_filename)) {
809
810 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
811
812 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
813
814 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
815 "gsdlassocfile",
816 "$afile:$assoc_file_rec->[2]:$dir");
817 } elsif ($self->{'verbosity'} > 1) {
818 print $outhandle "BasePlugout::process couldn't copy the associated file " .
819 "$real_filename to $afile\n";
820 }
821 }
822}
823
824
825sub process_metafiles_metadata
826{
827 my $self = shift (@_);
828 my ($doc_obj) = @_;
829
830 my $top_section = $doc_obj->get_top_section();
831 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
832
833 foreach my $metafile_pair (@$metafiles) {
834 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
835
836 $doc_obj->metadata_file($full_metafile,$metafile);
837 }
838
839 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
840}
841
842sub archiveinf_files_to_field
843{
844 my $self = shift(@_);
845 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
846
847 foreach my $file_rec (@$files) {
848 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
849 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
850 # for some reasons the image associate file has / before the full path
851 $real_filename =~ s/^\\(.*)/$1/i;
852
853 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
854
855 if (&FileUtils::fileExists($raw_filename)) {
856
857# if (defined $collect_dir) {
858# my $collect_dir_re_safe = $collect_dir;
859# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
860# $collect_dir_re_safe =~ s/\./\\./g;##
861
862# $real_filename =~ s/^$collect_dir_re_safe//;
863# }
864
865 if (defined $reverse_lookups) {
866 $reverse_lookups->{$real_filename} = 1;
867 }
868
869 if($field =~ m@assoc-file|src-file|meta-file@) {
870 $raw_filename = &util::abspath_to_placeholders($raw_filename);
871 }
872
873### push(@{$oid_files->{$field}},$full_file);
874 push(@{$oid_files->{$field}},$raw_filename);
875 }
876 else {
877 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
878 }
879 }
880}
881
882sub archiveinf_db
883{
884 my $self = shift (@_);
885 my ($doc_obj) = @_;
886
887 my $verbosity = $self->{'verbosity'};
888
889 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
890 if (defined $collect_dir) {
891 my $dirsep_regexp = &util::get_os_dirsep();
892
893 if ($collect_dir !~ /$dirsep_regexp$/) {
894 # ensure there is a slash at the end
895 $collect_dir .= &util::get_dirsep();
896 }
897 }
898
899 my $oid = $doc_obj->get_OID();
900 my $source_filename = $doc_obj->get_unmodified_source_filename();
901 my $working_info = $self->{'output_info'};
902 my $doc_info = $working_info->get_info($oid);
903
904 my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
905 # doc_file is the path to the archive doc.xml. Make sure it has unix
906 # slashes, then if the collection is copied to linux, it can be built without reimport
907 $doc_file =~ s/\\/\//g;
908 my $oid_files = { 'doc-file' => $doc_file,
909 'index-status' => $index_status,
910 'src-file' => $source_filename,
911 'sort-meta' => $sortmeta,
912 'assoc-file' => [],
913 'meta-file' => [] };
914 if (defined $group_position) {
915 $oid_files->{'group-position'} = $group_position;
916 }
917 my $reverse_lookups = { $source_filename => "1" };
918
919
920 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
921 $collect_dir,$oid_files,$reverse_lookups);
922
923
924 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
925 $collect_dir,$oid_files);
926
927 # Get the infodbtype value for this collection from the arcinfo object
928 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
929 my $output_dir = $self->{'output_dir'};
930
931 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
932
933 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
934
935 if (!$self->{'no_rss'})
936 {
937 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
938 my $top_section = $doc_obj->get_top_section();
939
940 # rss_title can be set in collect.cfg as follows:
941 # plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
942 # rss_title is a semi-colon or comma-separated list of the metadata field names that should
943 # be consulted in order to obtain a Title (anchor text) for the RSS document link.
944 # If not specified, rss_title will default to dc.Title, and fall back on Untitled
945 my $metafieldnames = $self->{'rss_title'};
946 my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
947 my $titles;
948 #@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
949 foreach my $metafieldname (@metafieldarray) {
950 $metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
951 $titles = $doc_obj->get_metadata($top_section,$metafieldname);
952
953 if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
954 last; # break out of the loop
955 }
956 }
957
958 # if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
959 # otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
960 if(scalar(@$titles) == 0) { #&& $metafieldnames !~ [email protected]@) {
961 push(@$titles, "Untitled");
962 }
963
964 # encode basic html entities like <>"& in the title(s), since the & char can break RSS links
965 for (my $i = 0; $i < scalar(@$titles); $i++) {
966 &ghtml::htmlsafe(@$titles[$i]);
967 }
968
969 my $dc_title = join("; ", @$titles);
970
971 if ($oid_files->{'index-status'} eq "R") {
972 $dc_title .= " (Updated)";
973 }
974
975 my $rss_entry = "<item>\n";
976 $rss_entry .= " <title>$dc_title</title>\n";
977 if(&util::is_gs3()) {
978 $rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
979 } else {
980 $rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
981 }
982 $rss_entry .= "</item>";
983
984 if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
985 {
986 my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
987 my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
988 &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
989 &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
990 }
991 else
992 {
993 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
994 my $rss_fh;
995 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
996 {
997 print $rss_fh $rss_entry . "\n";
998 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
999 }
1000 else
1001 {
1002 print STDERR "**** Failed to open $rss_filename\n$!\n";
1003 }
1004 }
1005 }
1006 }
1007
1008 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
1009 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
1010 $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
1011 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
1012 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
1013 if (defined $oid_files->{'group-position'}) {
1014 $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
1015 }
1016
1017 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
1018 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
1019 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
1020
1021 foreach my $rl (keys %$reverse_lookups) {
1022 $working_info->add_reverseinfo($rl,$oid);
1023 }
1024
1025 # meta files not set in reverse entry, but need to set the metadata flag
1026 if (defined $doc_obj->get_meta_files()) {
1027 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1028 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1029 $working_info->set_meta_file_flag($full_file);
1030 }
1031 }
1032}
1033
1034# This sub is called for every metadata.xml accepted for processing by by MetdataXMLPlugin
1035# and adds an entry into archiveinf-src.db for that file in the form:
1036# [@THISCOLLECTPATH@/import/metadata.xml]
1037# <meta-file>1
1038# This prevents blind reprocessing of the same old docs upon *incremental* building whenever
1039# we encounter a default empty metadata.xml that has no actual <FileSet> content defined.
1040sub add_metaxml_file_entry_to_archiveinfsrc {
1041 my $self = shift (@_);
1042 my ($full_file) = @_;
1043
1044 print STDERR "**** Adding metaxml file entry for full_file: $full_file\n";
1045 my $working_info = $self->{'output_info'};
1046 $working_info->set_meta_file_flag($full_file);
1047}
1048
1049
1050sub set_sortmeta {
1051 my $self = shift (@_);
1052 my ($sortmeta, $removeprefix, $removesuffix) = @_;
1053
1054 $self->{'sortmeta'} = $sortmeta;
1055 if (defined ($removeprefix) && $removeprefix ) {
1056 $removeprefix =~ s/^\^//; # don't need a leading ^
1057 $self->{'removeprefix'} = $removeprefix;
1058 }
1059 if (defined ($removesuffix) && $removesuffix) {
1060 $removesuffix =~ s/\$$//; # don't need a trailing $
1061 $self->{'removesuffix'} = $removesuffix;
1062 }
1063}
1064
1065
1066
1067sub open_xslt_pipe
1068{
1069 my $self = shift @_;
1070 my ($output_file_name, $xslt_file)=@_;
1071
1072 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1073
1074 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1075
1076 my $mapping_file_path = "";
1077
1078 if ($ENV{'GSDLOS'} eq "windows"){
1079 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1080 # this file:/// bit didn't work for me on windows XP
1081 #$xslt_file = "\"file:///".$xslt_file."\"";
1082 #$mapping_file_path = "\"file:///";
1083 }
1084 else{
1085 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1086 }
1087
1088
1089 $java_class_path = "\"".$java_class_path."\"";
1090
1091 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1092
1093 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1094 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1095 $cmd .= "-m $mapping_file_path";
1096 }
1097
1098 open(*XMLWRITER, $cmd)
1099 or die "can't open pipe to xslt: $!";
1100
1101
1102 $self->{'xslt_writer'} = *XMLWRITER;
1103
1104 print XMLWRITER "<?DocStart?>\n";
1105 print XMLWRITER "$output_file_name\n";
1106
1107
1108 }
1109
1110
1111sub close_xslt_pipe
1112{
1113 my $self = shift @_;
1114
1115
1116 return unless defined $self->{'xslt_writer'} ;
1117
1118 my $xsltwriter = $self->{'xslt_writer'};
1119
1120 print $xsltwriter "<?DocEnd?>\n";
1121 close($xsltwriter);
1122
1123 undef $self->{'xslt_writer'};
1124
1125}
1126
1127
1128
1129#the subclass should implement this method if is_group method could return 1.
1130sub close_group_output{
1131 my $self = shift (@_);
1132}
1133
1134sub is_group {
1135 my $self = shift (@_);
1136 return 0;
1137}
1138
1139my $dc_set = { Title => 1,
1140 Creator => 1,
1141 Subject => 1,
1142 Description => 1,
1143 Publisher => 1,
1144 Contributor => 1,
1145 Date => 1,
1146 Type => 1,
1147 Format => 1,
1148 Identifier => 1,
1149 Source => 1,
1150 Language => 1,
1151 Relation => 1,
1152 Coverage => 1,
1153 Rights => 1};
1154
1155
1156# returns an XML representation of the dublin core metadata
1157# if dc meta is not found, try ex meta
1158# This method is not used by the DSpacePlugout, which has its
1159# own method to save its dc metadata
1160sub get_dc_metadata {
1161 my $self = shift(@_);
1162 my ($doc_obj, $section, $version) = @_;
1163
1164 # build up string of dublin core metadata
1165 $section="" unless defined $section;
1166
1167 my $section_ptr = $doc_obj->_lookup_section($section);
1168 return "" unless defined $section_ptr;
1169
1170
1171 my $explicit_dc = {};
1172 my $explicit_ex_dc = {};
1173 my $explicit_ex = {};
1174
1175 my $all_text="";
1176
1177 # We want high quality dc metadata to go in first, so we store all the
1178 # assigned dc.* values first. Then, for all those dc metadata names in
1179 # the official dc set that are as yet unassigned, we look to see whether
1180 # embedded ex.dc.* metadata has defined some values for them. If not,
1181 # then for the same missing dc metadata names, we look in ex metadata.
1182
1183 foreach my $data (@{$section_ptr->{'metadata'}}){
1184 my $escaped_value = &docprint::escape_text($data->[1]);
1185 if ($data->[0]=~ m/^dc\./) {
1186 $data->[0] =~ tr/[A-Z]/[a-z]/;
1187
1188 $data->[0] =~ m/^dc\.(.*)/;
1189 my $dc_element = $1;
1190
1191 if (!defined $explicit_dc->{$dc_element}) {
1192 $explicit_dc->{$dc_element} = [];
1193 }
1194 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1195
1196 if (defined $version && ($version eq "oai_dc")) {
1197 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1198 }
1199 else {
1200 # qualifier???
1201 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1202 }
1203
1204 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1205 $data->[0] =~ m/^ex\.dc\.(.*)/;
1206 my $ex_dc_element = $1;
1207 my $lc_ex_dc_element = lc($ex_dc_element);
1208
1209 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1210 if (defined $dc_set->{$ex_dc_element}) {
1211 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1212 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1213 }
1214 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1215 }
1216 }
1217 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1218 $data->[0] =~ m/^(ex\.)?(.*)/;
1219 my $ex_element = $2;
1220 my $lc_ex_element = lc($ex_element);
1221
1222 if (defined $dc_set->{$ex_element}) {
1223 if (!defined $explicit_ex->{$lc_ex_element}) {
1224 $explicit_ex->{$lc_ex_element} = [];
1225 }
1226 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1227 }
1228 }
1229 }
1230
1231 # go through dc_set and for any element *not* defined in explicit_dc
1232 # that does exist in explicit_ex, add it in as metadata
1233 foreach my $k ( keys %$dc_set ) {
1234 my $lc_k = lc($k);
1235
1236 if (!defined $explicit_dc->{$lc_k}) {
1237 # try to find if ex.dc.* defines this dc.* meta,
1238 # if not, then look for whether there's an ex.* equivalent
1239
1240 if (defined $explicit_ex_dc->{$lc_k}) {
1241 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1242 my $dc_element = $lc_k;
1243 my $escaped_value = $v;
1244
1245 if (defined $version && ($version eq "oai_dc")) {
1246 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1247 }
1248 else {
1249 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1250 }
1251 }
1252 } elsif (defined $explicit_ex->{$lc_k}) {
1253 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1254 my $dc_element = $lc_k;
1255 my $escaped_value = $v;
1256
1257 if (defined $version && ($version eq "oai_dc")) {
1258 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1259 }
1260 else {
1261 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1262 }
1263 }
1264 }
1265 }
1266 }
1267
1268 if ($all_text eq "") {
1269 $all_text .= " There is no Dublin Core metatdata in this document\n";
1270 }
1271 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1272
1273 return $all_text;
1274}
1275
1276# Build up dublin_core metadata. Priority given to dc.* over ex.*
1277# This method was apparently added by Jeffrey and committed by Shaoqun.
1278# But we don't know why it was added, so not using it anymore.
1279sub new_get_dc_metadata {
1280
1281 my $self = shift(@_);
1282 my ($doc_obj, $section, $version) = @_;
1283
1284 # build up string of dublin core metadata
1285 $section="" unless defined $section;
1286
1287 my $section_ptr=$doc_obj->_lookup_section($section);
1288 return "" unless defined $section_ptr;
1289
1290 my $all_text = "";
1291 foreach my $data (@{$section_ptr->{'metadata'}}){
1292 my $escaped_value = &docprint::escape_text($data->[1]);
1293 my $dc_element = $data->[0];
1294
1295 my @array = split('\.',$dc_element);
1296 my ($type,$name);
1297
1298 if(defined $array[1])
1299 {
1300 $type = $array[0];
1301 $name = $array[1];
1302 }
1303 else
1304 {
1305 $type = "ex";
1306 $name = $array[0];
1307 }
1308
1309 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1310 }
1311 return $all_text;
1312}
1313
1314
13151;
Note: See TracBrowser for help on using the repository browser.