source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 27697

Last change on this file since 27697 was 27697, checked in by ak19, 11 years ago

Dr Bainbridge fixed it so that the gdb files generated on Windows for diffcol match those on Linux. This actually involved changing the order in which docids appear in archiveinf-doc. This last needed the newly invented flag -sort to the ArchivesInfPlugin in combination with -sortmeta OID to import.pl

  • Property svn:keywords set to Author Date Id Revision
File size: 35.4 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "false",
96 'hiddengli' => "no"},
97 { 'name' => 'no_rss',
98 'desc' => '{BasPlugout.no_rss}',
99 'type' => 'flag',
100 'reqd' => 'no',
101 'hiddengli' => 'yes'}
102];
103
104my $options = { 'name' => "BasePlugout",
105 'desc' => "{BasPlugout.desc}",
106 'abstract' => "yes",
107 'inherits' => "no",
108 'args' => $arguments};
109
110sub new
111{
112 my $class = shift (@_);
113
114 my ($plugoutlist,$args,$hashArgOptLists) = @_;
115 push(@$plugoutlist, $class);
116
117 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
118
119 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
120 push(@{$hashArgOptLists->{"OptList"}},$options);
121
122 my $self = {};
123 $self->{'plugout_type'} = $class;
124 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
125 $self->{"info_only"} = 0;
126
127 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
128 # the args, just return the object.
129 foreach my $strArg (@{$args})
130 {
131 if(defined $strArg && $strArg eq "-gsdlinfo")
132 {
133 $self->{"info_only"} = 1;
134 return bless $self, $class;
135 }
136 }
137
138 delete $self->{"info_only"};
139
140 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
141 {
142 my $classTempClass = bless $self, $class;
143 print STDERR "<BadPlugout d=$plugout_name>\n";
144 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
145 $classTempClass->print_txt_usage(""); # Use default resource bundle
146 die "\n";
147 }
148
149
150 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
151 {
152 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
153 if (!defined $full_file_path) {
154 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
155 die "\n";
156 }
157 $self->{'xslt_file'} = $full_file_path;
158 }
159
160 $self->{'gs_count'} = 0;
161
162 $self->{'keep_import_structure'} = 0;
163
164 return bless $self, $class;
165
166}
167
168sub print_xml_usage
169{
170 my $self = shift(@_);
171 my $header = shift(@_);
172 my $high_level_information_only = shift(@_);
173
174 # XML output is always in UTF-8
175 gsprintf::output_strings_in_UTF8;
176
177 if ($header) {
178 &PrintUsage::print_xml_header("plugout");
179 }
180 $self->print_xml($high_level_information_only);
181}
182
183
184sub print_xml
185{
186 my $self = shift(@_);
187 my $high_level_information_only = shift(@_);
188
189 my $optionlistref = $self->{'option_list'};
190 my @optionlist = @$optionlistref;
191 my $plugoutoptions = shift(@$optionlistref);
192 return if (!defined($plugoutoptions));
193
194 gsprintf(STDERR, "<PlugoutInfo>\n");
195 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
196 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
197 $desc =~ s/</&amp;lt;/g; # doubly escaped
198 $desc =~ s/>/&amp;gt;/g;
199 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
200 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
201 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
202 unless (defined($high_level_information_only)) {
203 gsprintf(STDERR, " <Arguments>\n");
204 if (defined($plugoutoptions->{'args'})) {
205 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
206 }
207 gsprintf(STDERR, " </Arguments>\n");
208
209 # Recurse up the plugout hierarchy
210 $self->print_xml();
211 }
212 gsprintf(STDERR, "</PlugoutInfo>\n");
213}
214
215
216sub print_txt_usage
217{
218 my $self = shift(@_);
219
220 # Print the usage message for a plugout (recursively)
221 my $descoffset = $self->determine_description_offset(0);
222 $self->print_plugout_usage($descoffset, 1);
223}
224
225sub determine_description_offset
226{
227 my $self = shift(@_);
228 my $maxoffset = shift(@_);
229
230 my $optionlistref = $self->{'option_list'};
231 my @optionlist = @$optionlistref;
232 my $plugoutoptions = pop(@$optionlistref);
233 return $maxoffset if (!defined($plugoutoptions));
234
235 # Find the length of the longest option string of this download
236 my $plugoutargs = $plugoutoptions->{'args'};
237 if (defined($plugoutargs)) {
238 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
239 if ($longest > $maxoffset) {
240 $maxoffset = $longest;
241 }
242 }
243
244 # Recurse up the download hierarchy
245 $maxoffset = $self->determine_description_offset($maxoffset);
246 $self->{'option_list'} = \@optionlist;
247 return $maxoffset;
248}
249
250
251sub print_plugout_usage
252{
253 my $self = shift(@_);
254 my $descoffset = shift(@_);
255 my $isleafclass = shift(@_);
256
257 my $optionlistref = $self->{'option_list'};
258 my @optionlist = @$optionlistref;
259 my $plugoutoptions = shift(@$optionlistref);
260 return if (!defined($plugoutoptions));
261
262 my $plugoutname = $plugoutoptions->{'name'};
263 my $plugoutargs = $plugoutoptions->{'args'};
264 my $plugoutdesc = $plugoutoptions->{'desc'};
265
266 # Produce the usage information using the data structure above
267 if ($isleafclass) {
268 if (defined($plugoutdesc)) {
269 gsprintf(STDERR, "$plugoutdesc\n\n");
270 }
271 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
272 }
273
274 # Display the download options, if there are some
275 if (defined($plugoutargs)) {
276 # Calculate the column offset of the option descriptions
277 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
278
279 if ($isleafclass) {
280 gsprintf(STDERR, " {common.specific_options}:\n");
281 }
282 else {
283 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
284 }
285
286 # Display the download options
287 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
288 }
289
290 # Recurse up the download hierarchy
291 $self->print_plugout_usage($descoffset, 0);
292 $self->{'option_list'} = \@optionlist;
293}
294
295
296sub error
297{
298 my ($strFunctionName,$strError) = @_;
299 {
300 print "Error occoured in BasePlugout.pm\n".
301 "In Function: ".$strFunctionName."\n".
302 "Error Message: ".$strError."\n";
303 exit(-1);
304 }
305}
306
307# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
308sub set_OIDtype {
309 my $self = shift (@_);
310 my ($type, $metadata) = @_;
311
312 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
313 $self->{'OIDtype'} = $type;
314 } else {
315 $self->{'OIDtype'} = "hash";
316 }
317 if ($type =~ /^assigned$/) {
318 if (defined $metadata) {
319 $self->{'OIDmetadata'} = $metadata;
320 } else {
321 $self->{'OIDmetadata'} = "dc.Identifier";
322 }
323 }
324}
325
326sub set_output_dir
327{
328 my $self = shift @_;
329 my ($output_dir) = @_;
330
331 $self->{'output_dir'} = $output_dir;
332}
333
334sub setoutputdir
335{
336 my $self = shift @_;
337 my ($output_dir) = @_;
338
339 $self->{'output_dir'} = $output_dir;
340}
341
342sub get_output_dir
343{
344 my $self = shift (@_);
345
346 return $self->{'output_dir'};
347}
348
349sub getoutputdir
350{
351 my $self = shift (@_);
352
353 return $self->{'output_dir'};
354}
355
356sub getoutputinfo
357{
358 my $self = shift (@_);
359
360 return $self->{'output_info'};
361}
362
363
364sub get_output_handler
365{
366 my $self = shift (@_);
367
368 my ($output_file_name) = @_;
369
370 my $fh;
371 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
372
373 return $fh;
374}
375
376sub release_output_handler
377{
378 my $self = shift (@_);
379 my ($outhandler) = @_;
380
381 close($outhandler);
382
383}
384
385sub output_xml_header {
386 my $self = shift (@_);
387 my ($handle,$docroot,$nondoctype) = @_;
388
389
390 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
391
392 #For Dspace must be UTF in lower case
393 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
394
395 if (!defined $nondoctype){
396 my $doctype = (defined $docroot) ? $docroot : "Section";
397
398 # Used to be '<!DOCTYPE Archive SYSTEM ...'
399
400 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
401 }
402
403 print $handle "<$docroot>\n" if defined $docroot;
404}
405
406sub output_xml_footer {
407 my $self = shift (@_);
408 my ($handle,$docroot) = @_;
409 print $handle "</$docroot>\n" if defined $docroot;
410}
411
412
413sub output_general_xml_header
414{
415 my $self = shift (@_);
416 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
417
418 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
419
420 if (defined $opt_dtd) {
421 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
422 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
423 }
424
425 if (defined $docroot) {
426 my $full_docroot = $docroot;
427 if (defined $opt_attributes) {
428 $full_docroot .= " $opt_attributes";
429 }
430
431 print $handle "<$full_docroot>\n"
432 }
433}
434
435sub output_general_xml_footer
436{
437 output_xml_footer(@_);
438}
439
440
441sub process {
442 my $self = shift (@_);
443 my ($doc_obj) = @_;
444
445 # for OAI purposes
446 $doc_obj->set_lastmodified();
447 $doc_obj->set_oailastmodified();
448
449 if ($self->{'group_size'} > 1) {
450 $self->group_process ($doc_obj);
451 return;
452 }
453
454 my $OID = $doc_obj->get_OID();
455 $OID = "NULL" unless defined $OID;
456
457 my $top_section = $doc_obj->get_top_section();
458
459 #get document's directory
460 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
461
462 my $output_info = $self->{'output_info'};
463 return if (!defined $output_info);
464
465 ##############################
466 # call subclass' saveas method
467 ##############################
468 $self->saveas($doc_obj,$doc_dir);
469 $self->archiveinf_db($doc_obj,$doc_dir);
470
471}
472
473sub store_output_info_reference {
474 my $self = shift (@_);
475 my ($doc_obj) = @_;
476
477 my $output_info = $self->{'output_info'};
478 my $metaname = $self->{'sortmeta'};
479
480 if (!defined $metaname || $metaname !~ /\S/) {
481 my $OID = $doc_obj->get_OID();
482 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "");
483 return;
484 }
485
486 if ($metaname eq "OID") { # sort by OID
487 my $OID = $doc_obj->get_OID();
488 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID);
489 return;
490 }
491
492 my $metadata = "";
493 my $top_section = $doc_obj->get_top_section();
494
495 my @commameta_list = split(/,/, $metaname);
496 foreach my $cmn (@commameta_list) {
497 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
498 if ($meta) {
499 # do remove prefix/suffix - this will apply to all values
500 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
501 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
502 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
503 $metadata .= $meta if ($meta);
504 }
505 }
506
507 # store reference in the output_info
508 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
509
510}
511
512sub group_process {
513
514 my $self = shift (@_);
515 my ($doc_obj) = @_;
516
517 my $OID = $doc_obj->get_OID();
518 $OID = "NULL" unless defined $OID;
519
520 my $groupsize = $self->{'group_size'};
521 my $gs_count = $self->{'gs_count'};
522 my $open_new_file = (($gs_count % $groupsize)==0);
523 my $outhandle = $self->{'output_handle'};
524
525 # opening a new file, or document has assoicated files => directory needed
526 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
527
528 # The directory the archive file (doc.xml) and all associated files
529 # should end up in
530 my $doc_dir;
531 # If we've determined its time for a new file, open it now
532 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
533 {
534 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
535 # only if opening new file
536 my $output_dir = $self->get_output_dir();
537 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
538 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
539 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
540
541 if ($gs_count>0)
542 {
543 return if (!$self->close_file_output());
544 }
545
546 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
547
548
549 $self->{'gs_filename'} = $doc_file;
550 $self->{'short_doc_file'} = $short_doc_file;
551 $self->{'gs_OID'} = $OID;
552 $self->{'gs_doc_dir'} = $doc_dir;
553
554 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
555 }
556 # Otherwise load the same archive document directory used last time
557 else
558 {
559 $doc_dir = $self->{'gs_doc_dir'};
560 }
561
562 # copy all the associated files, add this information as metadata
563 # to the document
564 print $outhandle "Writing associated files to $doc_dir\n";
565 $self->process_assoc_files ($doc_obj, $doc_dir);
566
567 # look up 'gsdlmetafile' metadata and store that information
568 # explicitly in $doc_obj
569 $self->process_metafiles_metadata ($doc_obj);
570 }
571
572 # save this document
573 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
574 print GROUPPROCESS $section_text;
575
576 $self->{'gs_count'}++;
577}
578
579
580sub saveas {
581 my $self = shift (@_);
582
583 die "Basplug::saveas function must be implemented in sub classes\n";
584}
585
586sub get_doc_dir {
587 my $self = shift (@_);
588 my ($OID, $source_filename) = @_;
589
590 my $working_dir = $self->get_output_dir();
591 my $working_info = $self->{'output_info'};
592 return if (!defined $working_info);
593
594 my $doc_info = $working_info->get_info($OID);
595 my $doc_dir = '';
596
597 if (defined $doc_info && scalar(@$doc_info) >= 1)
598 {
599 # This OID already has an archives directory, so use it again
600 $doc_dir = $doc_info->[0];
601 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
602 }
603 elsif ($self->{'keep_import_structure'})
604 {
605 $source_filename = &File::Basename::dirname($source_filename);
606 $source_filename =~ s/[\\\/]+/\//g;
607 $source_filename =~ s/\/$//;
608
609 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
610 }
611
612 # We have to use a new archives directory for this document
613 if ($doc_dir eq "")
614 {
615 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
616 }
617
618 if (!defined $self->{'group'} || !$self->{'group'}){
619 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
620 }
621
622 return $doc_dir;
623}
624
625## @function get_new_doc_dir()
626#
627# Once a doc object is ready to write to disk (and hence has a nice OID),
628# generate a unique subdirectory to write the information to.
629# - create the directory as part of this call, to try and avoid race conditions
630# found in parallel processing [jmt12]
631#
632sub get_new_doc_dir
633{
634 my $self = shift (@_);
635 my($working_info,$working_dir,$OID) = @_;
636
637 my $doc_dir = "";
638 my $doc_dir_rest = $OID;
639
640 # remove any \ and / from the OID
641 $doc_dir_rest =~ s/[\\\/]//g;
642
643 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
644 if ($ENV{'GSDLOS'} =~ /^windows$/i)
645 {
646 $doc_dir_rest =~ s/\://g;
647 }
648
649 # we generally create a unique directory by adding consequtive fragments of
650 # the document identifier (split by some predefined length - defaulting to
651 # 8) until we find a directory that doesn't yet exist. Note that directories
652 # that contain a document have a suffix ".dir" (whereas those that contain
653 # only subdirectories have no suffix).
654 my $doc_dir_num = 0; # how many directories deep we are
655 my $created_directory = 0; # have we successfully created a new directory
656 do
657 {
658 # (does this work on windows? - jmt12)
659 if ($doc_dir_num > 0)
660 {
661 $doc_dir .= '/';
662 }
663 # the default matching pattern grabs the next 'subdir_split_length'
664 # characters of the OID to act as the next subdirectory
665 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
666 # Do we count any "HASH" prefix against the split length limit?
667 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
668 {
669 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
670 }
671 # Note the use of 's' to both capture the next chuck of OID and to remove
672 # it from OID at the same time
673 if ($doc_dir_rest =~ s/$pattern//i)
674 {
675 $doc_dir .= $1;
676 $doc_dir_num++;
677 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
678 }
679 my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
680 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
681 ###rint STDERR " - create directory: $new_dir => $created_directory\n";
682 ###rint STDERR " - rest: $doc_dir_rest\n";
683 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
684 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
685 }
686 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
687
688 # not unique yet? Add on an incremental suffix until we are unique
689 my $i = 1;
690 my $doc_dir_base = $doc_dir;
691 while ($created_directory == 0)
692 {
693 $doc_dir = $doc_dir_base . '-' . $i;
694 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
695 $i++;
696 }
697
698 # in theory this should never happen
699 if (!$created_directory)
700 {
701 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
702 }
703
704 return $doc_dir . '.dir';
705}
706## get_new_doc_dir()
707
708
709sub process_assoc_files {
710 my $self = shift (@_);
711 my ($doc_obj, $doc_dir, $handle) = @_;
712
713 my $outhandle = $self->{'output_handle'};
714
715 my $output_dir = $self->get_output_dir();
716 return if (!defined $output_dir);
717
718 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
719
720 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
721 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
722
723 my @assoc_files = ();
724 my $filename;;
725
726 my $source_filename = $doc_obj->get_source_filename();
727
728 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
729
730 if (defined $collect_dir) {
731 my $dirsep_regexp = &util::get_os_dirsep();
732
733 if ($collect_dir !~ /$dirsep_regexp$/) {
734 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
735 }
736
737 # This test is never going to fail on Windows -- is this a problem?
738
739 if ($source_filename !~ /^$dirsep_regexp/) {
740 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
741 }
742 }
743
744
745 # set the assocfile path (even if we have no assoc files - need this for lucene)
746 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
747 "assocfilepath",
748 "$doc_dir");
749 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
750 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
751 $dir = "" unless defined $dir;
752
753 my $utf8_real_filename = $assoc_file_rec->[0];
754
755 # for some reasons the image associate file has / before the full path
756 $utf8_real_filename =~ s/^\\(.*)/$1/i;
757
758## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
759 my $real_filename = $utf8_real_filename;
760 $real_filename = &util::downgrade_if_dos_filename($real_filename);
761
762 if (&FileUtils::fileExists($real_filename)) {
763
764 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
765
766 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
767
768 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
769 "gsdlassocfile",
770 "$afile:$assoc_file_rec->[2]:$dir");
771 } elsif ($self->{'verbosity'} > 1) {
772 print $outhandle "BasePlugout::process couldn't copy the associated file " .
773 "$real_filename to $afile\n";
774 }
775 }
776}
777
778
779sub process_metafiles_metadata
780{
781 my $self = shift (@_);
782 my ($doc_obj) = @_;
783
784 my $top_section = $doc_obj->get_top_section();
785 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
786
787 foreach my $metafile_pair (@$metafiles) {
788 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
789
790 $doc_obj->metadata_file($full_metafile,$metafile);
791 }
792
793 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
794}
795
796sub archiveinf_files_to_field
797{
798 my $self = shift(@_);
799 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
800
801 foreach my $file_rec (@$files) {
802 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
803 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
804 # for some reasons the image associate file has / before the full path
805 $real_filename =~ s/^\\(.*)/$1/i;
806
807 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
808
809 if (&FileUtils::fileExists($raw_filename)) {
810
811# if (defined $collect_dir) {
812# my $collect_dir_re_safe = $collect_dir;
813# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
814# $collect_dir_re_safe =~ s/\./\\./g;##
815
816# $real_filename =~ s/^$collect_dir_re_safe//;
817# }
818
819 if (defined $reverse_lookups) {
820 $reverse_lookups->{$real_filename} = 1;
821 }
822### push(@{$oid_files->{$field}},$full_file);
823 push(@{$oid_files->{$field}},$raw_filename);
824 }
825 else {
826 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
827 }
828 }
829}
830
831sub archiveinf_db
832{
833 my $self = shift (@_);
834 my ($doc_obj) = @_;
835
836 my $verbosity = $self->{'verbosity'};
837
838 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
839 if (defined $collect_dir) {
840 my $dirsep_regexp = &util::get_os_dirsep();
841
842 if ($collect_dir !~ /$dirsep_regexp$/) {
843 # ensure there is a slash at the end
844 $collect_dir .= &util::get_dirsep();
845 }
846 }
847
848 my $oid = $doc_obj->get_OID();
849 my $source_filename = $doc_obj->get_unmodified_source_filename();
850 my $working_info = $self->{'output_info'};
851 my $doc_info = $working_info->get_info($oid);
852
853 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
854 # doc_file is the path to the archive doc.xml. Make sure it has unix
855 # slashes, then if the collection is copied to linux, it can be built without reimport
856 $doc_file =~ s/\\/\//g;
857 my $oid_files = { 'doc-file' => $doc_file,
858 'index-status' => $index_status,
859 'src-file' => $source_filename,
860 'sort-meta' => $sortmeta,
861 'assoc-file' => [],
862 'meta-file' => [] };
863
864 my $reverse_lookups = { $source_filename => "1" };
865
866
867 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
868 $collect_dir,$oid_files,$reverse_lookups);
869
870
871 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
872 $collect_dir,$oid_files);
873
874 # Get the infodbtype value for this collection from the arcinfo object
875 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
876 my $output_dir = $self->{'output_dir'};
877
878 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
879 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
880
881 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
882
883 if (!$self->{'no_rss'})
884 {
885 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
886 my $top_section = $doc_obj->get_top_section();
887
888 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
889 my $dc_title = join("; ", @$dc_titles);
890
891 if ($oid_files->{'index-status'} eq "R") {
892 $dc_title .= " (Updated)";
893 }
894
895 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
896 my $rss_fh;
897 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh)) {
898 print $rss_fh "<item>\n";
899 print $rss_fh " <title>$dc_title</title>\n";
900 print $rss_fh " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
901 print $rss_fh "</item>\n";
902 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
903 }
904 else {
905 print STDERR "**** Failed to open $rss_filename\n$!\n";
906 }
907 }
908 }
909
910 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
911 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
912 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
913 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
914
915 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
916 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
917 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
918
919 foreach my $rl (keys %$reverse_lookups) {
920 $working_info->add_reverseinfo($rl,$oid);
921 }
922
923 # meta files not set in reverese entry, but need to set the metadata flag
924 if (defined $doc_obj->get_meta_files()) {
925 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
926 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
927 $working_info->set_meta_file_flag($full_file);
928 }
929 }
930}
931
932
933sub set_sortmeta {
934 my $self = shift (@_);
935 my ($sortmeta, $removeprefix, $removesuffix) = @_;
936
937 $self->{'sortmeta'} = $sortmeta;
938 if (defined ($removeprefix) && $removeprefix ) {
939 $removeprefix =~ s/^\^//; # don't need a leading ^
940 $self->{'removeprefix'} = $removeprefix;
941 }
942 if (defined ($removesuffix) && $removesuffix) {
943 $removesuffix =~ s/\$$//; # don't need a trailing $
944 $self->{'removesuffix'} = $removesuffix;
945 }
946}
947
948sub open_xslt_pipe
949{
950 my $self = shift @_;
951 my ($output_file_name, $xslt_file)=@_;
952
953 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
954
955 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
956
957 my $mapping_file_path = "";
958
959 if ($ENV{'GSDLOS'} eq "windows"){
960 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
961 # this file:/// bit didn't work for me on windows XP
962 #$xslt_file = "\"file:///".$xslt_file."\"";
963 #$mapping_file_path = "\"file:///";
964 }
965 else{
966 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
967 }
968
969
970 $java_class_path = "\"".$java_class_path."\"";
971
972 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
973
974 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
975 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
976 $cmd .= "-m $mapping_file_path";
977 }
978
979 open(*XMLWRITER, $cmd)
980 or die "can't open pipe to xslt: $!";
981
982
983 $self->{'xslt_writer'} = *XMLWRITER;
984
985 print XMLWRITER "<?DocStart?>\n";
986 print XMLWRITER "$output_file_name\n";
987
988
989 }
990
991
992sub close_xslt_pipe
993{
994 my $self = shift @_;
995
996
997 return unless defined $self->{'xslt_writer'} ;
998
999 my $xsltwriter = $self->{'xslt_writer'};
1000
1001 print $xsltwriter "<?DocEnd?>\n";
1002 close($xsltwriter);
1003
1004 undef $self->{'xslt_writer'};
1005
1006}
1007
1008sub close_file_output
1009{
1010 my ($self) = @_;
1011
1012 # make sure that the handle has been opened - it won't be if we failed
1013 # to import any documents...
1014 if (defined(fileno(GROUPPROCESS))) {
1015 $self->output_xml_footer('GROUPPROCESS','Archive');
1016 close GROUPPROCESS;
1017 }
1018
1019 my $OID = $self->{'gs_OID'};
1020 my $short_doc_file = $self->{'short_doc_file'};
1021
1022 if ($self->{'gzip'}) {
1023 my $doc_file = $self->{'gs_filename'};
1024 `gzip $doc_file`;
1025 $doc_file .= ".gz";
1026 $short_doc_file .= ".gz";
1027 if (!&FileUtils::fileExists($doc_file)) {
1028 my $outhandle = $self->{'output_handle'};
1029 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
1030 return 0;
1031 }
1032 }
1033
1034 # store reference in output_info
1035 my $output_info = $self->{'output_info'};
1036 return 0 if (!defined $output_info);
1037 $output_info->add_info($OID, $short_doc_file, undef, undef);
1038 return 1;
1039}
1040
1041
1042#the subclass should implement this method if is_group method could return 1.
1043sub close_group_output{
1044 my $self = shift (@_);
1045}
1046
1047sub is_group {
1048 my $self = shift (@_);
1049 return 0;
1050}
1051
1052my $dc_set = { Title => 1,
1053 Creator => 1,
1054 Subject => 1,
1055 Description => 1,
1056 Publisher => 1,
1057 Contributor => 1,
1058 Date => 1,
1059 Type => 1,
1060 Format => 1,
1061 Identifier => 1,
1062 Source => 1,
1063 Language => 1,
1064 Relation => 1,
1065 Coverage => 1,
1066 Rights => 1};
1067
1068
1069# returns an XML representation of the dublin core metadata
1070# if dc meta is not found, try ex meta
1071# This method is not used by the DSpacePlugout, which has its
1072# own method to save its dc metadata
1073sub get_dc_metadata {
1074 my $self = shift(@_);
1075 my ($doc_obj, $section, $version) = @_;
1076
1077 # build up string of dublin core metadata
1078 $section="" unless defined $section;
1079
1080 my $section_ptr = $doc_obj->_lookup_section($section);
1081 return "" unless defined $section_ptr;
1082
1083
1084 my $explicit_dc = {};
1085 my $explicit_ex_dc = {};
1086 my $explicit_ex = {};
1087
1088 my $all_text="";
1089
1090 # We want high quality dc metadata to go in first, so we store all the
1091 # assigned dc.* values first. Then, for all those dc metadata names in
1092 # the official dc set that are as yet unassigned, we look to see whether
1093 # embedded ex.dc.* metadata has defined some values for them. If not,
1094 # then for the same missing dc metadata names, we look in ex metadata.
1095
1096 foreach my $data (@{$section_ptr->{'metadata'}}){
1097 my $escaped_value = &docprint::escape_text($data->[1]);
1098 if ($data->[0]=~ m/^dc\./) {
1099 $data->[0] =~ tr/[A-Z]/[a-z]/;
1100
1101 $data->[0] =~ m/^dc\.(.*)/;
1102 my $dc_element = $1;
1103
1104 if (!defined $explicit_dc->{$dc_element}) {
1105 $explicit_dc->{$dc_element} = [];
1106 }
1107 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1108
1109 if (defined $version && ($version eq "oai_dc")) {
1110 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1111 }
1112 else {
1113 # qualifier???
1114 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1115 }
1116
1117 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1118 $data->[0] =~ m/^ex\.dc\.(.*)/;
1119 my $ex_dc_element = $1;
1120 my $lc_ex_dc_element = lc($ex_dc_element);
1121
1122 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1123 if (defined $dc_set->{$ex_dc_element}) {
1124 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1125 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1126 }
1127 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1128 }
1129 }
1130 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1131 $data->[0] =~ m/^(ex\.)?(.*)/;
1132 my $ex_element = $2;
1133 my $lc_ex_element = lc($ex_element);
1134
1135 if (defined $dc_set->{$ex_element}) {
1136 if (!defined $explicit_ex->{$lc_ex_element}) {
1137 $explicit_ex->{$lc_ex_element} = [];
1138 }
1139 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1140 }
1141 }
1142 }
1143
1144 # go through dc_set and for any element *not* defined in explicit_dc
1145 # that does exist in explicit_ex, add it in as metadata
1146 foreach my $k ( keys %$dc_set ) {
1147 my $lc_k = lc($k);
1148
1149 if (!defined $explicit_dc->{$lc_k}) {
1150 # try to find if ex.dc.* defines this dc.* meta,
1151 # if not, then look for whether there's an ex.* equivalent
1152
1153 if (defined $explicit_ex_dc->{$lc_k}) {
1154 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1155 my $dc_element = $lc_k;
1156 my $escaped_value = $v;
1157
1158 if (defined $version && ($version eq "oai_dc")) {
1159 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1160 }
1161 else {
1162 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1163 }
1164 }
1165 } elsif (defined $explicit_ex->{$lc_k}) {
1166 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1167 my $dc_element = $lc_k;
1168 my $escaped_value = $v;
1169
1170 if (defined $version && ($version eq "oai_dc")) {
1171 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1172 }
1173 else {
1174 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1175 }
1176 }
1177 }
1178 }
1179 }
1180
1181 if ($all_text eq "") {
1182 $all_text .= " There is no Dublin Core metatdata in this document\n";
1183 }
1184 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1185
1186 return $all_text;
1187}
1188
1189# Build up dublin_core metadata. Priority given to dc.* over ex.*
1190# This method was apparently added by Jeffrey and committed by Shaoqun.
1191# But we don't know why it was added, so not using it anymore.
1192sub new_get_dc_metadata {
1193
1194 my $self = shift(@_);
1195 my ($doc_obj, $section, $version) = @_;
1196
1197 # build up string of dublin core metadata
1198 $section="" unless defined $section;
1199
1200 my $section_ptr=$doc_obj->_lookup_section($section);
1201 return "" unless defined $section_ptr;
1202
1203 my $all_text = "";
1204 foreach my $data (@{$section_ptr->{'metadata'}}){
1205 my $escaped_value = &docprint::escape_text($data->[1]);
1206 my $dc_element = $data->[0];
1207
1208 my @array = split('\.',$dc_element);
1209 my ($type,$name);
1210
1211 if(defined $array[1])
1212 {
1213 $type = $array[0];
1214 $name = $array[1];
1215 }
1216 else
1217 {
1218 $type = "ex";
1219 $name = $array[0];
1220 }
1221
1222 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1223 }
1224 return $all_text;
1225}
1226
1227
12281;
Note: See TracBrowser for help on using the repository browser.