source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 28996

Last change on this file since 28996 was 28996, checked in by ak19, 10 years ago

GS3 does not have the new httpdomainHtmlsafe macro introduced into GS2 for the changes for security.

  • Property svn:keywords set to Author Date Id Revision
File size: 35.5 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "xslt_file",
47 'desc' => "{BasPlugout.xslt_file}",
48 'type' => "string",
49 'reqd' => "no",
50 'deft' => "",
51 'hiddengli' => "no"},
52 { 'name' => "subdir_split_length",
53 'desc' => "{BasPlugout.subdir_split_length}",
54 'type' => "int",
55 'reqd' => "no",
56 'deft' => "8",
57 'hiddengli' => "no"},
58 { 'name' => "subdir_hash_prefix",
59 'desc' => "{BasPlugout.subdir_hash_prefix}",
60 'type' => "flag",
61 'reqd' => "no",
62 'deft' => "0",
63 'hiddengli' => "no"},
64 { 'name' => "gzip_output",
65 'desc' => "{BasPlugout.gzip_output}",
66 'type' => "flag",
67 'reqd' => "no",
68 'hiddengli' => "no"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "output_info",
76 'desc' => "{BasPlugout.output_info}",
77 'type' => "string",
78 'reqd' => "yes",
79 'hiddengli' => "yes"},
80 { 'name' => "output_handle",
81 'desc' => "{BasPlugout.output_handle}",
82 'type' => "string",
83 'deft' => 'STDERR',
84 'reqd' => "no",
85 'hiddengli' => "yes"},
86 { 'name' => "debug",
87 'desc' => "{BasPlugout.debug}",
88 'type' => "flag",
89 'reqd' => "no",
90 'hiddengli' => "yes"},
91 { 'name' => 'no_rss',
92 'desc' => "{BasPlugout.no_rss}",
93 'type' => 'flag',
94 'reqd' => 'no',
95 'hiddengli' => 'yes'},
96 { 'name' => "no_auxiliary_databases",
97 'desc' => "{BasPlugout.no_auxiliary_databases}",
98 'type' => "flag",
99 'reqd' => "no",
100 'hiddengli' => "yes"}
101
102];
103
104my $options = { 'name' => "BasePlugout",
105 'desc' => "{BasPlugout.desc}",
106 'abstract' => "yes",
107 'inherits' => "no",
108 'args' => $arguments};
109
110sub new
111{
112 my $class = shift (@_);
113
114 my ($plugoutlist,$args,$hashArgOptLists) = @_;
115 push(@$plugoutlist, $class);
116
117 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
118
119 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
120 push(@{$hashArgOptLists->{"OptList"}},$options);
121
122 my $self = {};
123 $self->{'plugout_type'} = $class;
124 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
125 $self->{"info_only"} = 0;
126
127 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
128 # the args, just return the object.
129 foreach my $strArg (@{$args})
130 {
131 if(defined $strArg && $strArg eq "-gsdlinfo")
132 {
133 $self->{"info_only"} = 1;
134 return bless $self, $class;
135 }
136 }
137
138 delete $self->{"info_only"};
139
140 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
141 {
142 my $classTempClass = bless $self, $class;
143 print STDERR "<BadPlugout d=$plugout_name>\n";
144 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
145 $classTempClass->print_txt_usage(""); # Use default resource bundle
146 die "\n";
147 }
148
149
150 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
151 {
152 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
153 if (!defined $full_file_path) {
154 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
155 die "\n";
156 }
157 $self->{'xslt_file'} = $full_file_path;
158 }
159
160 # for group processing
161 $self->{'gs_count'} = 0;
162 $self->{'group_position'} = 1;
163
164 $self->{'keep_import_structure'} = 0;
165
166 $self->{'generate_databases'} = 1;
167 if ($self->{'no_auxiliary_databases'}) {
168 $self->{'generate_databases'} = 0;
169 }
170 undef $self->{'no_auxiliary_databases'};
171 return bless $self, $class;
172
173}
174
175# implement this in subclass if you want to do some initialization after
176# loading and setting parameters, and before processing the documents.
177sub begin {
178
179 my $self= shift (@_);
180
181}
182sub print_xml_usage
183{
184 my $self = shift(@_);
185 my $header = shift(@_);
186 my $high_level_information_only = shift(@_);
187
188 # XML output is always in UTF-8
189 gsprintf::output_strings_in_UTF8;
190
191 if ($header) {
192 &PrintUsage::print_xml_header("plugout");
193 }
194 $self->print_xml($high_level_information_only);
195}
196
197
198sub print_xml
199{
200 my $self = shift(@_);
201 my $high_level_information_only = shift(@_);
202
203 my $optionlistref = $self->{'option_list'};
204 my @optionlist = @$optionlistref;
205 my $plugoutoptions = shift(@$optionlistref);
206 return if (!defined($plugoutoptions));
207
208 gsprintf(STDERR, "<PlugoutInfo>\n");
209 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
210 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
211 $desc =~ s/</&amp;lt;/g; # doubly escaped
212 $desc =~ s/>/&amp;gt;/g;
213 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
214 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
215 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
216 unless (defined($high_level_information_only)) {
217 gsprintf(STDERR, " <Arguments>\n");
218 if (defined($plugoutoptions->{'args'})) {
219 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
220 }
221 gsprintf(STDERR, " </Arguments>\n");
222
223 # Recurse up the plugout hierarchy
224 $self->print_xml();
225 }
226 gsprintf(STDERR, "</PlugoutInfo>\n");
227}
228
229
230sub print_txt_usage
231{
232 my $self = shift(@_);
233
234 # Print the usage message for a plugout (recursively)
235 my $descoffset = $self->determine_description_offset(0);
236 $self->print_plugout_usage($descoffset, 1);
237}
238
239sub determine_description_offset
240{
241 my $self = shift(@_);
242 my $maxoffset = shift(@_);
243
244 my $optionlistref = $self->{'option_list'};
245 my @optionlist = @$optionlistref;
246 my $plugoutoptions = pop(@$optionlistref);
247 return $maxoffset if (!defined($plugoutoptions));
248
249 # Find the length of the longest option string of this download
250 my $plugoutargs = $plugoutoptions->{'args'};
251 if (defined($plugoutargs)) {
252 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
253 if ($longest > $maxoffset) {
254 $maxoffset = $longest;
255 }
256 }
257
258 # Recurse up the download hierarchy
259 $maxoffset = $self->determine_description_offset($maxoffset);
260 $self->{'option_list'} = \@optionlist;
261 return $maxoffset;
262}
263
264
265sub print_plugout_usage
266{
267 my $self = shift(@_);
268 my $descoffset = shift(@_);
269 my $isleafclass = shift(@_);
270
271 my $optionlistref = $self->{'option_list'};
272 my @optionlist = @$optionlistref;
273 my $plugoutoptions = shift(@$optionlistref);
274 return if (!defined($plugoutoptions));
275
276 my $plugoutname = $plugoutoptions->{'name'};
277 my $plugoutargs = $plugoutoptions->{'args'};
278 my $plugoutdesc = $plugoutoptions->{'desc'};
279
280 # Produce the usage information using the data structure above
281 if ($isleafclass) {
282 if (defined($plugoutdesc)) {
283 gsprintf(STDERR, "$plugoutdesc\n\n");
284 }
285 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
286 }
287
288 # Display the download options, if there are some
289 if (defined($plugoutargs)) {
290 # Calculate the column offset of the option descriptions
291 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
292
293 if ($isleafclass) {
294 gsprintf(STDERR, " {common.specific_options}:\n");
295 }
296 else {
297 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
298 }
299
300 # Display the download options
301 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
302 }
303
304 # Recurse up the download hierarchy
305 $self->print_plugout_usage($descoffset, 0);
306 $self->{'option_list'} = \@optionlist;
307}
308
309
310sub error
311{
312 my ($strFunctionName,$strError) = @_;
313 {
314 print "Error occoured in BasePlugout.pm\n".
315 "In Function: ".$strFunctionName."\n".
316 "Error Message: ".$strError."\n";
317 exit(-1);
318 }
319}
320
321# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
322sub set_OIDtype {
323 my $self = shift (@_);
324 my ($type, $metadata) = @_;
325
326 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
327 $self->{'OIDtype'} = $type;
328 } else {
329 $self->{'OIDtype'} = "hash";
330 }
331 if ($type =~ /^assigned$/) {
332 if (defined $metadata) {
333 $self->{'OIDmetadata'} = $metadata;
334 } else {
335 $self->{'OIDmetadata'} = "dc.Identifier";
336 }
337 }
338}
339
340sub set_output_dir
341{
342 my $self = shift @_;
343 my ($output_dir) = @_;
344
345 $self->{'output_dir'} = $output_dir;
346}
347
348sub setoutputdir
349{
350 my $self = shift @_;
351 my ($output_dir) = @_;
352
353 $self->{'output_dir'} = $output_dir;
354}
355
356sub get_output_dir
357{
358 my $self = shift (@_);
359
360 return $self->{'output_dir'};
361}
362
363sub getoutputdir
364{
365 my $self = shift (@_);
366
367 return $self->{'output_dir'};
368}
369
370sub getoutputinfo
371{
372 my $self = shift (@_);
373
374 return $self->{'output_info'};
375}
376
377
378sub get_output_handler
379{
380 my $self = shift (@_);
381
382 my ($output_file_name) = @_;
383
384 my $fh;
385 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
386
387 return $fh;
388}
389
390sub release_output_handler
391{
392 my $self = shift (@_);
393 my ($outhandler) = @_;
394
395 close($outhandler);
396
397}
398
399sub output_xml_header {
400 my $self = shift (@_);
401 my ($handle,$docroot,$nondoctype) = @_;
402
403
404 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
405
406 #For Dspace must be UTF in lower case
407 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
408
409 if (!defined $nondoctype){
410 my $doctype = (defined $docroot) ? $docroot : "Section";
411
412 # Used to be '<!DOCTYPE Archive SYSTEM ...'
413
414 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
415 }
416
417 print $handle "<$docroot>\n" if defined $docroot;
418}
419
420sub output_xml_footer {
421 my $self = shift (@_);
422 my ($handle,$docroot) = @_;
423 print $handle "</$docroot>\n" if defined $docroot;
424}
425
426
427sub output_general_xml_header
428{
429 my $self = shift (@_);
430 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
431
432 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
433
434 if (defined $opt_dtd) {
435 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
436 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
437 }
438
439 if (defined $docroot) {
440 my $full_docroot = $docroot;
441 if (defined $opt_attributes) {
442 $full_docroot .= " $opt_attributes";
443 }
444
445 print $handle "<$full_docroot>\n"
446 }
447}
448
449sub output_general_xml_footer
450{
451 output_xml_footer(@_);
452}
453
454# This is called by the plugins after read_into_doc_obj generates the doc_obj.
455sub process {
456 my $self = shift (@_);
457 my ($doc_obj) = @_;
458
459 my $output_info = $self->{'output_info'};
460 return if (!defined $output_info);
461
462 # for OAI purposes
463 $doc_obj->set_lastmodified();
464 $doc_obj->set_oailastmodified();
465
466 # find out which directory to save to
467 my $doc_dir = "";
468 if ($self->is_group()) {
469 $doc_dir = $self->get_group_doc_dir($doc_obj);
470 } else {
471 $doc_dir = $self->get_doc_dir($doc_obj);
472 }
473
474 ##############################
475 # call subclass' saveas method
476 ##############################
477 $self->saveas($doc_obj,$doc_dir);
478
479 # write out data to archiveinf-doc.db
480 if ($self->{'generate_databases'}) {
481 $self->archiveinf_db($doc_obj);
482 }
483 if ($self->is_group()) {
484 $self->{'gs_count'}++; # do we want this for all cases?
485 $self->{'group_position'}++;
486 }
487}
488
489sub store_output_info_reference {
490 my $self = shift (@_);
491 my ($doc_obj) = @_;
492
493 my $output_info = $self->{'output_info'};
494 my $metaname = $self->{'sortmeta'};
495
496 my $group_position;
497 if ($self->is_group()) {
498 $group_position = $self->{'group_position'};
499 }
500 if (!defined $metaname || $metaname !~ /\S/) {
501 my $OID = $doc_obj->get_OID();
502 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
503 return;
504 }
505
506 if ($metaname eq "OID") { # sort by OID
507 my $OID = $doc_obj->get_OID();
508 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
509 return;
510 }
511
512 my $metadata = "";
513 my $top_section = $doc_obj->get_top_section();
514
515 my @commameta_list = split(/,/, $metaname);
516 foreach my $cmn (@commameta_list) {
517 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
518 if ($meta) {
519 # do remove prefix/suffix - this will apply to all values
520 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
521 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
522 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
523 $metadata .= $meta if ($meta);
524 }
525 }
526
527 # store reference in the output_info
528 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
529
530}
531
532
533
534sub saveas {
535 my $self = shift (@_);
536 my ($doc_obj, $doc_dir) = @_;
537
538 die "Basplug::saveas function must be implemented in sub classes\n";
539}
540
541sub get_group_doc_dir {
542 my $self = shift (@_);
543 my ($doc_obj) = @_;
544
545 my $outhandle = $self->{'output_handle'};
546 my $OID = $doc_obj->get_OID();
547 $OID = "NULL" unless defined $OID;
548
549 my $groupsize = $self->{'group_size'};
550 my $gs_count = $self->{'gs_count'};
551 my $open_new_file = (($gs_count % $groupsize)==0);
552
553 my $doc_dir;
554
555 if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
556 # if we have some assoc files, then we will need to start a new file
557 if ($self->{'verbosity'} > 2) {
558 print $outhandle " Starting a archives folder for $OID as it has associated files\n";
559 }
560 $open_new_file = 1;
561 }
562
563 # opening a new file
564 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) {
565 # first we close off the old output
566 if ($gs_count>0)
567 {
568 return if (!$self->close_group_output());
569 }
570
571 # this will create the directory
572 $doc_dir = $self->get_doc_dir ($doc_obj);
573 $self->{'new_doc_dir'} = 1;
574 $self->{'gs_doc_dir'} = $doc_dir;
575 $self->{'group_position'} = 1;
576 }
577 else {
578 $doc_dir = $self->{'gs_doc_dir'};
579 $self->{'new_doc_dir'} = 0;
580 }
581 return $doc_dir;
582
583}
584sub get_doc_dir {
585
586 my $self = shift (@_);
587 my ($doc_obj) = @_;
588
589 my $OID = $doc_obj->get_OID();
590 $OID = "NULL" unless defined $OID;
591
592 my $working_dir = $self->get_output_dir();
593 my $working_info = $self->{'output_info'};
594 return if (!defined $working_info);
595
596 my $doc_info = $working_info->get_info($OID);
597 my $doc_dir = '';
598
599 if (defined $doc_info && scalar(@$doc_info) >= 1)
600 {
601 # This OID already has an archives directory, so use it again
602 $doc_dir = $doc_info->[0];
603 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
604 }
605 elsif ($self->{'keep_import_structure'})
606 {
607 my $source_filename = $doc_obj->get_source_filename();
608 $source_filename = &File::Basename::dirname($source_filename);
609 $source_filename =~ s/[\\\/]+/\//g;
610 $source_filename =~ s/\/$//;
611
612 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
613 }
614
615 # We have to use a new archives directory for this document
616 if ($doc_dir eq "")
617 {
618 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
619 }
620
621 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
622
623 return $doc_dir;
624}
625
626
627## @function get_new_doc_dir()
628#
629# Once a doc object is ready to write to disk (and hence has a nice OID),
630# generate a unique subdirectory to write the information to.
631#
632# - create the directory as part of this call, to try and avoid race conditions
633# found in parallel processing [jmt12]
634#
635# @todo figure out what the rule regarding $work_info->size() is meant to do
636#
637# @todo determine what $self->{'group'} is, and whether it should affect
638# directory creation
639#
640sub get_new_doc_dir
641{
642 my $self = shift (@_);
643 my($working_info,$working_dir,$OID) = @_;
644
645 my $doc_dir = "";
646 my $doc_dir_rest = $OID;
647
648 # remove any \ and / from the OID
649 $doc_dir_rest =~ s/[\\\/]//g;
650
651 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
652 if ($ENV{'GSDLOS'} =~ /^windows$/i)
653 {
654 $doc_dir_rest =~ s/\://g;
655 }
656
657 # we generally create a unique directory by adding consequtive fragments of
658 # the document identifier (split by some predefined length - defaulting to
659 # 8) until we find a directory that doesn't yet exist. Note that directories
660 # that contain a document have a suffix ".dir" (whereas those that contain
661 # only subdirectories have no suffix).
662 my $doc_dir_num = 0; # how many directories deep we are
663 my $created_directory = 0; # have we successfully created a new directory
664 do
665 {
666 # (does this work on windows? - jmt12)
667 if ($doc_dir_num > 0)
668 {
669 $doc_dir .= '/';
670 }
671
672 # the default matching pattern grabs the next 'subdir_split_length'
673 # characters of the OID to act as the next subdirectory
674 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
675
676 # Do we count any "HASH" prefix against the split length limit?
677 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
678 {
679 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
680 }
681
682 # Note the use of 's' to both capture the next chuck of OID and to remove
683 # it from OID at the same time
684 if ($doc_dir_rest =~ s/$pattern//i)
685 {
686 $doc_dir .= $1;
687 $doc_dir_num++;
688
689 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
690 if(!FileUtils::directoryExists($full_doc_dir))
691 {
692 &FileUtils::makeAllDirectories($full_doc_dir);
693 $created_directory = 1;
694 }
695
696 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
697 ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
698 ###rint STDERR " - rest: $doc_dir_rest\n";
699 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
700 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
701 }
702 }
703 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
704
705 # not unique yet? Add on an incremental suffix until we are unique
706 my $i = 1;
707 my $doc_dir_base = $doc_dir;
708 while ($created_directory == 0)
709 {
710 $doc_dir = $doc_dir_base . '-' . $i;
711 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
712 $i++;
713 }
714
715 # in theory this should never happen
716 if (!$created_directory)
717 {
718 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
719 }
720
721 return $doc_dir . '.dir';
722}
723## get_new_doc_dir()
724
725
726sub process_assoc_files {
727 my $self = shift (@_);
728 my ($doc_obj, $doc_dir, $handle) = @_;
729
730 my $outhandle = $self->{'output_handle'};
731
732 my $output_dir = $self->get_output_dir();
733 return if (!defined $output_dir);
734
735 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
736
737 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
738 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
739
740 my @assoc_files = ();
741 my $filename;;
742
743 my $source_filename = $doc_obj->get_source_filename();
744
745 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
746
747 if (defined $collect_dir) {
748 my $dirsep_regexp = &util::get_os_dirsep();
749
750 if ($collect_dir !~ /$dirsep_regexp$/) {
751 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
752 }
753
754 # This test is never going to fail on Windows -- is this a problem?
755
756 if ($source_filename !~ /^$dirsep_regexp/) {
757 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
758 }
759 }
760
761
762 # set the assocfile path (even if we have no assoc files - need this for lucene)
763 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
764 "assocfilepath",
765 "$doc_dir");
766 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
767 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
768 $dir = "" unless defined $dir;
769
770 my $utf8_real_filename = $assoc_file_rec->[0];
771
772 # for some reasons the image associate file has / before the full path
773 $utf8_real_filename =~ s/^\\(.*)/$1/i;
774
775## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
776 my $real_filename = $utf8_real_filename;
777 $real_filename = &util::downgrade_if_dos_filename($real_filename);
778
779 if (&FileUtils::fileExists($real_filename)) {
780
781 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
782
783 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
784
785 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
786 "gsdlassocfile",
787 "$afile:$assoc_file_rec->[2]:$dir");
788 } elsif ($self->{'verbosity'} > 1) {
789 print $outhandle "BasePlugout::process couldn't copy the associated file " .
790 "$real_filename to $afile\n";
791 }
792 }
793}
794
795
796sub process_metafiles_metadata
797{
798 my $self = shift (@_);
799 my ($doc_obj) = @_;
800
801 my $top_section = $doc_obj->get_top_section();
802 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
803
804 foreach my $metafile_pair (@$metafiles) {
805 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
806
807 $doc_obj->metadata_file($full_metafile,$metafile);
808 }
809
810 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
811}
812
813sub archiveinf_files_to_field
814{
815 my $self = shift(@_);
816 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
817
818 foreach my $file_rec (@$files) {
819 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
820 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
821 # for some reasons the image associate file has / before the full path
822 $real_filename =~ s/^\\(.*)/$1/i;
823
824 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
825
826 if (&FileUtils::fileExists($raw_filename)) {
827
828# if (defined $collect_dir) {
829# my $collect_dir_re_safe = $collect_dir;
830# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
831# $collect_dir_re_safe =~ s/\./\\./g;##
832
833# $real_filename =~ s/^$collect_dir_re_safe//;
834# }
835
836 if (defined $reverse_lookups) {
837 $reverse_lookups->{$real_filename} = 1;
838 }
839
840 if($field =~ m@assoc-file|src-file|meta-file@) {
841 $raw_filename = &util::abspath_to_placeholders($raw_filename);
842 }
843
844### push(@{$oid_files->{$field}},$full_file);
845 push(@{$oid_files->{$field}},$raw_filename);
846 }
847 else {
848 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
849 }
850 }
851}
852
853sub archiveinf_db
854{
855 my $self = shift (@_);
856 my ($doc_obj) = @_;
857
858 my $verbosity = $self->{'verbosity'};
859
860 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
861 if (defined $collect_dir) {
862 my $dirsep_regexp = &util::get_os_dirsep();
863
864 if ($collect_dir !~ /$dirsep_regexp$/) {
865 # ensure there is a slash at the end
866 $collect_dir .= &util::get_dirsep();
867 }
868 }
869
870 my $oid = $doc_obj->get_OID();
871 my $source_filename = $doc_obj->get_unmodified_source_filename();
872 my $working_info = $self->{'output_info'};
873 my $doc_info = $working_info->get_info($oid);
874
875 my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
876 # doc_file is the path to the archive doc.xml. Make sure it has unix
877 # slashes, then if the collection is copied to linux, it can be built without reimport
878 $doc_file =~ s/\\/\//g;
879 my $oid_files = { 'doc-file' => $doc_file,
880 'index-status' => $index_status,
881 'src-file' => $source_filename,
882 'sort-meta' => $sortmeta,
883 'assoc-file' => [],
884 'meta-file' => [] };
885 if (defined $group_position) {
886 $oid_files->{'group-position'} = $group_position;
887 }
888 my $reverse_lookups = { $source_filename => "1" };
889
890
891 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
892 $collect_dir,$oid_files,$reverse_lookups);
893
894
895 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
896 $collect_dir,$oid_files);
897
898 # Get the infodbtype value for this collection from the arcinfo object
899 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
900 my $output_dir = $self->{'output_dir'};
901
902 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
903
904 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
905
906 if (!$self->{'no_rss'})
907 {
908 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
909 my $top_section = $doc_obj->get_top_section();
910
911 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
912 my $dc_title = join("; ", @$dc_titles);
913
914 if ($oid_files->{'index-status'} eq "R") {
915 $dc_title .= " (Updated)";
916 }
917
918 my $rss_entry = "<item>\n";
919 $rss_entry .= " <title>$dc_title</title>\n";
920 if($ENV{'GSDL3SRCHOME'}) {
921 $rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
922 } else {
923 $rss_entry .= " <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
924 }
925 $rss_entry .= "</item>";
926
927 if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
928 {
929 my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
930 my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
931 &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
932 &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
933 }
934 else
935 {
936 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
937 my $rss_fh;
938 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
939 {
940 print $rss_fh $rss_entry . "\n";
941 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
942 }
943 else
944 {
945 print STDERR "**** Failed to open $rss_filename\n$!\n";
946 }
947 }
948 }
949 }
950
951 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
952 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
953 $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
954 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
955 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
956 if (defined $oid_files->{'group-position'}) {
957 $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
958 }
959
960 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
961 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
962 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
963
964 foreach my $rl (keys %$reverse_lookups) {
965 $working_info->add_reverseinfo($rl,$oid);
966 }
967
968 # meta files not set in reverese entry, but need to set the metadata flag
969 if (defined $doc_obj->get_meta_files()) {
970 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
971 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
972 $working_info->set_meta_file_flag($full_file);
973 }
974 }
975}
976
977
978sub set_sortmeta {
979 my $self = shift (@_);
980 my ($sortmeta, $removeprefix, $removesuffix) = @_;
981
982 $self->{'sortmeta'} = $sortmeta;
983 if (defined ($removeprefix) && $removeprefix ) {
984 $removeprefix =~ s/^\^//; # don't need a leading ^
985 $self->{'removeprefix'} = $removeprefix;
986 }
987 if (defined ($removesuffix) && $removesuffix) {
988 $removesuffix =~ s/\$$//; # don't need a trailing $
989 $self->{'removesuffix'} = $removesuffix;
990 }
991}
992
993
994
995sub open_xslt_pipe
996{
997 my $self = shift @_;
998 my ($output_file_name, $xslt_file)=@_;
999
1000 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1001
1002 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1003
1004 my $mapping_file_path = "";
1005
1006 if ($ENV{'GSDLOS'} eq "windows"){
1007 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1008 # this file:/// bit didn't work for me on windows XP
1009 #$xslt_file = "\"file:///".$xslt_file."\"";
1010 #$mapping_file_path = "\"file:///";
1011 }
1012 else{
1013 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1014 }
1015
1016
1017 $java_class_path = "\"".$java_class_path."\"";
1018
1019 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1020
1021 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1022 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1023 $cmd .= "-m $mapping_file_path";
1024 }
1025
1026 open(*XMLWRITER, $cmd)
1027 or die "can't open pipe to xslt: $!";
1028
1029
1030 $self->{'xslt_writer'} = *XMLWRITER;
1031
1032 print XMLWRITER "<?DocStart?>\n";
1033 print XMLWRITER "$output_file_name\n";
1034
1035
1036 }
1037
1038
1039sub close_xslt_pipe
1040{
1041 my $self = shift @_;
1042
1043
1044 return unless defined $self->{'xslt_writer'} ;
1045
1046 my $xsltwriter = $self->{'xslt_writer'};
1047
1048 print $xsltwriter "<?DocEnd?>\n";
1049 close($xsltwriter);
1050
1051 undef $self->{'xslt_writer'};
1052
1053}
1054
1055
1056
1057#the subclass should implement this method if is_group method could return 1.
1058sub close_group_output{
1059 my $self = shift (@_);
1060}
1061
1062sub is_group {
1063 my $self = shift (@_);
1064 return 0;
1065}
1066
1067my $dc_set = { Title => 1,
1068 Creator => 1,
1069 Subject => 1,
1070 Description => 1,
1071 Publisher => 1,
1072 Contributor => 1,
1073 Date => 1,
1074 Type => 1,
1075 Format => 1,
1076 Identifier => 1,
1077 Source => 1,
1078 Language => 1,
1079 Relation => 1,
1080 Coverage => 1,
1081 Rights => 1};
1082
1083
1084# returns an XML representation of the dublin core metadata
1085# if dc meta is not found, try ex meta
1086# This method is not used by the DSpacePlugout, which has its
1087# own method to save its dc metadata
1088sub get_dc_metadata {
1089 my $self = shift(@_);
1090 my ($doc_obj, $section, $version) = @_;
1091
1092 # build up string of dublin core metadata
1093 $section="" unless defined $section;
1094
1095 my $section_ptr = $doc_obj->_lookup_section($section);
1096 return "" unless defined $section_ptr;
1097
1098
1099 my $explicit_dc = {};
1100 my $explicit_ex_dc = {};
1101 my $explicit_ex = {};
1102
1103 my $all_text="";
1104
1105 # We want high quality dc metadata to go in first, so we store all the
1106 # assigned dc.* values first. Then, for all those dc metadata names in
1107 # the official dc set that are as yet unassigned, we look to see whether
1108 # embedded ex.dc.* metadata has defined some values for them. If not,
1109 # then for the same missing dc metadata names, we look in ex metadata.
1110
1111 foreach my $data (@{$section_ptr->{'metadata'}}){
1112 my $escaped_value = &docprint::escape_text($data->[1]);
1113 if ($data->[0]=~ m/^dc\./) {
1114 $data->[0] =~ tr/[A-Z]/[a-z]/;
1115
1116 $data->[0] =~ m/^dc\.(.*)/;
1117 my $dc_element = $1;
1118
1119 if (!defined $explicit_dc->{$dc_element}) {
1120 $explicit_dc->{$dc_element} = [];
1121 }
1122 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1123
1124 if (defined $version && ($version eq "oai_dc")) {
1125 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1126 }
1127 else {
1128 # qualifier???
1129 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1130 }
1131
1132 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1133 $data->[0] =~ m/^ex\.dc\.(.*)/;
1134 my $ex_dc_element = $1;
1135 my $lc_ex_dc_element = lc($ex_dc_element);
1136
1137 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1138 if (defined $dc_set->{$ex_dc_element}) {
1139 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1140 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1141 }
1142 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1143 }
1144 }
1145 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1146 $data->[0] =~ m/^(ex\.)?(.*)/;
1147 my $ex_element = $2;
1148 my $lc_ex_element = lc($ex_element);
1149
1150 if (defined $dc_set->{$ex_element}) {
1151 if (!defined $explicit_ex->{$lc_ex_element}) {
1152 $explicit_ex->{$lc_ex_element} = [];
1153 }
1154 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1155 }
1156 }
1157 }
1158
1159 # go through dc_set and for any element *not* defined in explicit_dc
1160 # that does exist in explicit_ex, add it in as metadata
1161 foreach my $k ( keys %$dc_set ) {
1162 my $lc_k = lc($k);
1163
1164 if (!defined $explicit_dc->{$lc_k}) {
1165 # try to find if ex.dc.* defines this dc.* meta,
1166 # if not, then look for whether there's an ex.* equivalent
1167
1168 if (defined $explicit_ex_dc->{$lc_k}) {
1169 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1170 my $dc_element = $lc_k;
1171 my $escaped_value = $v;
1172
1173 if (defined $version && ($version eq "oai_dc")) {
1174 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1175 }
1176 else {
1177 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1178 }
1179 }
1180 } elsif (defined $explicit_ex->{$lc_k}) {
1181 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1182 my $dc_element = $lc_k;
1183 my $escaped_value = $v;
1184
1185 if (defined $version && ($version eq "oai_dc")) {
1186 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1187 }
1188 else {
1189 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1190 }
1191 }
1192 }
1193 }
1194 }
1195
1196 if ($all_text eq "") {
1197 $all_text .= " There is no Dublin Core metatdata in this document\n";
1198 }
1199 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1200
1201 return $all_text;
1202}
1203
1204# Build up dublin_core metadata. Priority given to dc.* over ex.*
1205# This method was apparently added by Jeffrey and committed by Shaoqun.
1206# But we don't know why it was added, so not using it anymore.
1207sub new_get_dc_metadata {
1208
1209 my $self = shift(@_);
1210 my ($doc_obj, $section, $version) = @_;
1211
1212 # build up string of dublin core metadata
1213 $section="" unless defined $section;
1214
1215 my $section_ptr=$doc_obj->_lookup_section($section);
1216 return "" unless defined $section_ptr;
1217
1218 my $all_text = "";
1219 foreach my $data (@{$section_ptr->{'metadata'}}){
1220 my $escaped_value = &docprint::escape_text($data->[1]);
1221 my $dc_element = $data->[0];
1222
1223 my @array = split('\.',$dc_element);
1224 my ($type,$name);
1225
1226 if(defined $array[1])
1227 {
1228 $type = $array[0];
1229 $name = $array[1];
1230 }
1231 else
1232 {
1233 $type = "ex";
1234 $name = $array[0];
1235 }
1236
1237 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1238 }
1239 return $all_text;
1240}
1241
1242
12431;
Note: See TracBrowser for help on using the repository browser.