source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 23824

Last change on this file since 23824 was 23824, checked in by sjm84, 13 years ago

Phase one of commiting the files changed to extend the DSpace exporting capabilities to include more than just dublin core metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 29.6 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'deft' => "",
61 'hiddengli' => "no"},
62 { 'name' => "output_handle",
63 'desc' => "{BasPlugout.output_handle}",
64 'type' => "string",
65 'deft' => 'STDERR',
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "verbosity",
69 'desc' => "{BasPlugout.verbosity}",
70 'type' => "int",
71 'deft' => "0",
72 'reqd' => "no",
73 'hiddengli' => "no"},
74 { 'name' => "gzip_output",
75 'desc' => "{BasPlugout.gzip_output}",
76 'type' => "flag",
77 'reqd' => "no",
78 'hiddengli' => "no"},
79 { 'name' => "debug",
80 'desc' => "{BasPlugout.debug}",
81 'type' => "flag",
82 'reqd' => "no",
83 'hiddengli' => "yes"}
84];
85
86my $options = { 'name' => "BasePlugout",
87 'desc' => "{BasPlugout.desc}",
88 'abstract' => "yes",
89 'inherits' => "no",
90 'args' => $arguments};
91
92sub new
93{
94 my $class = shift (@_);
95
96 my ($plugoutlist,$args,$hashArgOptLists) = @_;
97 push(@$plugoutlist, $class);
98
99 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102 push(@{$hashArgOptLists->{"OptList"}},$options);
103
104 my $self = {};
105 $self->{'plugout_type'} = $class;
106 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107 $self->{"info_only"} = 0;
108
109 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110 # the args, just return the object.
111 foreach my $strArg (@{$args})
112 {
113 if(defined $strArg && $strArg eq "-gsdlinfo")
114 {
115 $self->{"info_only"} = 1;
116 return bless $self, $class;
117 }
118 }
119
120 delete $self->{"info_only"};
121
122 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123 {
124 my $classTempClass = bless $self, $class;
125 print STDERR "<BadPlugout d=$plugout_name>\n";
126 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127 $classTempClass->print_txt_usage(""); # Use default resource bundle
128 die "\n";
129 }
130
131
132 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
133 {
134 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135 if (!defined $full_file_path) {
136 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137 die "\n";
138 }
139 $self->{'xslt_file'} = $full_file_path;
140 }
141
142 $self->{'gs_count'} = 0;
143
144 $self->{'keep_import_structure'} = 0;
145
146 return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152 my $self = shift(@_);
153 my $header = shift(@_);
154 my $high_level_information_only = shift(@_);
155
156 # XML output is always in UTF-8
157 gsprintf::output_strings_in_UTF8;
158
159 if ($header) {
160 &PrintUsage::print_xml_header("plugout");
161 }
162 $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168 my $self = shift(@_);
169 my $high_level_information_only = shift(@_);
170
171 my $optionlistref = $self->{'option_list'};
172 my @optionlist = @$optionlistref;
173 my $plugoutoptions = shift(@$optionlistref);
174 return if (!defined($plugoutoptions));
175
176 gsprintf(STDERR, "<PlugoutInfo>\n");
177 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
178 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179 $desc =~ s/</&amp;lt;/g; # doubly escaped
180 $desc =~ s/>/&amp;gt;/g;
181 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184 unless (defined($high_level_information_only)) {
185 gsprintf(STDERR, " <Arguments>\n");
186 if (defined($plugoutoptions->{'args'})) {
187 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188 }
189 gsprintf(STDERR, " </Arguments>\n");
190
191 # Recurse up the plugout hierarchy
192 $self->print_xml();
193 }
194 gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200 my $self = shift(@_);
201
202 # Print the usage message for a plugout (recursively)
203 my $descoffset = $self->determine_description_offset(0);
204 $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209 my $self = shift(@_);
210 my $maxoffset = shift(@_);
211
212 my $optionlistref = $self->{'option_list'};
213 my @optionlist = @$optionlistref;
214 my $plugoutoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($plugoutoptions));
216
217 # Find the length of the longest option string of this download
218 my $plugoutargs = $plugoutoptions->{'args'};
219 if (defined($plugoutargs)) {
220 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the download hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235 my $self = shift(@_);
236 my $descoffset = shift(@_);
237 my $isleafclass = shift(@_);
238
239 my $optionlistref = $self->{'option_list'};
240 my @optionlist = @$optionlistref;
241 my $plugoutoptions = shift(@$optionlistref);
242 return if (!defined($plugoutoptions));
243
244 my $plugoutname = $plugoutoptions->{'name'};
245 my $plugoutargs = $plugoutoptions->{'args'};
246 my $plugoutdesc = $plugoutoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugoutdesc)) {
251 gsprintf(STDERR, "$plugoutdesc\n\n");
252 }
253 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254 }
255
256 # Display the download options, if there are some
257 if (defined($plugoutargs)) {
258 # Calculate the column offset of the option descriptions
259 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266 }
267
268 # Display the download options
269 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270 }
271
272 # Recurse up the download hierarchy
273 $self->print_plugout_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280 my ($strFunctionName,$strError) = @_;
281 {
282 print "Error occoured in BasePlugout.pm\n".
283 "In Function: ".$strFunctionName."\n".
284 "Error Message: ".$strError."\n";
285 exit(-1);
286 }
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291 my $self = shift (@_);
292 my ($type, $metadata) = @_;
293
294 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295 $self->{'OIDtype'} = $type;
296 } else {
297 $self->{'OIDtype'} = "hash";
298 }
299 if ($type =~ /^assigned$/) {
300 if (defined $metadata) {
301 $self->{'OIDmetadata'} = $metadata;
302 } else {
303 $self->{'OIDmetadata'} = "dc.Identifier";
304 }
305 }
306}
307
308sub set_output_dir
309{
310 my $self = shift @_;
311 my ($output_dir) = @_;
312
313 $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318 my $self = shift @_;
319 my ($output_dir) = @_;
320
321 $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326 my $self = shift (@_);
327
328 return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333 my $self = shift (@_);
334
335 return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340 my $self = shift (@_);
341
342 return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348 my $self = shift (@_);
349
350 my ($output_file_name) = @_;
351
352 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
353
354 return *OUTPUT;
355}
356
357sub release_output_handler
358{
359 my $self = shift (@_);
360 my ($outhandler) = @_;
361
362 close($outhandler);
363
364}
365
366sub output_xml_header {
367 my $self = shift (@_);
368 my ($handle,$docroot,$nondoctype) = @_;
369
370
371 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372
373 #For Dspace must be UTF in lower case
374 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
375
376 if (!defined $nondoctype){
377 my $doctype = (defined $docroot) ? $docroot : "Section";
378
379 # Used to be '<!DOCTYPE Archive SYSTEM ...'
380
381 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
382 }
383
384 print $handle "<$docroot>\n" if defined $docroot;
385}
386
387sub output_xml_footer {
388 my $self = shift (@_);
389 my ($handle,$docroot) = @_;
390 print $handle "</$docroot>\n" if defined $docroot;
391}
392
393
394sub output_general_xml_header
395{
396 my $self = shift (@_);
397 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
398
399 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
400
401 if (defined $opt_dtd) {
402 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
403 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
404 }
405
406 if (defined $docroot) {
407 my $full_docroot = $docroot;
408 if (defined $opt_attributes) {
409 $full_docroot .= " $opt_attributes";
410 }
411
412 print $handle "<$full_docroot>\n"
413 }
414}
415
416sub output_general_xml_footer
417{
418 output_xml_footer(@_);
419}
420
421
422sub process {
423 my $self = shift (@_);
424 my ($doc_obj) = @_;
425
426 $doc_obj->set_lastmodified();
427
428 if ($self->{'group_size'} > 1) {
429 $self->group_process ($doc_obj);
430 return;
431 }
432
433 my $OID = $doc_obj->get_OID();
434 $OID = "NULL" unless defined $OID;
435
436 my $top_section = $doc_obj->get_top_section();
437
438 #get document's directory
439 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
440
441 my $output_info = $self->{'output_info'};
442 return if (!defined $output_info);
443
444 ##############################
445 # call subclass' saveas method
446 ##############################
447 $self->saveas($doc_obj,$doc_dir);
448 $self->archiveinf_db($doc_obj,$doc_dir);
449
450}
451
452sub store_output_info_reference {
453 my $self = shift (@_);
454 my ($doc_obj) = @_;
455
456 my $output_info = $self->{'output_info'};
457 my $metaname = $self->{'sortmeta'};
458 if (!defined $metaname || $metaname !~ /\S/) {
459 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
460 return;
461 }
462
463 my $metadata = "";
464 my $top_section = $doc_obj->get_top_section();
465
466 my @commameta_list = split(/,/, $metaname);
467 foreach my $cmn (@commameta_list) {
468 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
469 if ($meta) {
470 # do remove prefix/suffix - this will apply to all values
471 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
472 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
473 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
474 $metadata .= $meta if ($meta);
475 }
476 }
477
478 # store reference in the output_info
479 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
480
481}
482
483sub group_process {
484
485 my $self = shift (@_);
486 my ($doc_obj) = @_;
487
488 my $OID = $doc_obj->get_OID();
489 $OID = "NULL" unless defined $OID;
490
491 my $groupsize = $self->{'group_size'};
492 my $gs_count = $self->{'gs_count'};
493 my $open_new_file = (($gs_count % $groupsize)==0);
494 my $outhandle = $self->{'output_handle'};
495
496 # opening a new file, or document has assoicated files => directory needed
497 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
498
499 # The directory the archive file (doc.xml) and all associated files
500 # should end up in
501 my $doc_dir;
502 # If we've determined its time for a new file, open it now
503 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
504 {
505 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
506 # only if opening new file
507 my $output_dir = $self->get_output_dir();
508 &util::mk_all_dir ($output_dir) unless -e $output_dir;
509 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
510 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
511
512 if ($gs_count>0)
513 {
514 return if (!$self->close_file_output());
515 }
516
517 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
518
519
520 $self->{'gs_filename'} = $doc_file;
521 $self->{'short_doc_file'} = $short_doc_file;
522 $self->{'gs_OID'} = $OID;
523 $self->{'gs_doc_dir'} = $doc_dir;
524
525 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
526 }
527 # Otherwise load the same archive document directory used last time
528 else
529 {
530 $doc_dir = $self->{'gs_doc_dir'};
531 }
532
533 # copy all the associated files, add this information as metadata
534 # to the document
535 print $outhandle "Writing associated files to $doc_dir\n";
536 $self->process_assoc_files ($doc_obj, $doc_dir);
537
538 # look up 'gsdlmetafile' metadata and store that information
539 # explicitly in $doc_obj
540 $self->process_metafiles_metadata ($doc_obj);
541 }
542
543 # save this document
544 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
545 print GROUPPROCESS $section_text;
546
547 $self->{'gs_count'}++;
548}
549
550
551sub saveas {
552 my $self = shift (@_);
553
554 die "Basplug::saveas function must be implemented in sub classes\n";
555}
556
557sub get_doc_dir {
558 my $self = shift (@_);
559 my ($OID, $source_filename) = @_;
560
561 my $working_dir = $self->get_output_dir();
562 my $working_info = $self->{'output_info'};
563 return if (!defined $working_info);
564
565 my $doc_info = $working_info->get_info($OID);
566 my $doc_dir = '';
567
568 if (defined $doc_info && scalar(@$doc_info) >= 1)
569 {
570 # This OID already has an archives directory, so use it again
571 $doc_dir = $doc_info->[0];
572 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
573 }
574 elsif ($self->{'keep_import_structure'})
575 {
576 $source_filename = &File::Basename::dirname($source_filename);
577 $source_filename =~ s/[\\\/]+/\//g;
578 $source_filename =~ s/\/$//;
579
580 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
581 }
582
583 # We have to use a new archives directory for this document
584 if ($doc_dir eq "")
585 {
586 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
587 }
588
589 if (!defined $self->{'group'} || !$self->{'group'}){
590 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
591 }
592
593 return $doc_dir;
594}
595
596sub get_new_doc_dir{
597 my $self = shift (@_);
598 my($working_info,$working_dir,$OID) = @_;
599
600
601 my $doc_dir = "";
602 my $doc_dir_rest = $OID;
603
604 # remove any \ and / from the OID
605 $doc_dir_rest =~ s/[\\\/]//g;
606
607 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
608 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
609
610 my $doc_dir_num = 0;
611
612 do {
613 $doc_dir .= "/" if $doc_dir_num > 0;
614 if ($doc_dir_rest =~ s/^(.{1,8})//) {
615 $doc_dir .= $1;
616 $doc_dir_num++;
617 }
618 } while ($doc_dir_rest ne "" &&
619 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
620 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
621 my $i = 1;
622 my $doc_dir_base = $doc_dir;
623 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
624 $doc_dir = "$doc_dir_base-$i";
625 $i++;
626 }
627
628 return "$doc_dir.dir";
629}
630
631sub process_assoc_files {
632 my $self = shift (@_);
633 my ($doc_obj, $doc_dir, $handle) = @_;
634
635 my $outhandle = $self->{'output_handle'};
636
637 my $output_dir = $self->get_output_dir();
638 return if (!defined $output_dir);
639
640 &util::mk_all_dir ($output_dir) unless -e $output_dir;
641
642 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
643 &util::mk_all_dir ($working_dir) unless -e $working_dir;
644
645 my @assoc_files = ();
646 my $filename;;
647
648 my $source_filename = $doc_obj->get_source_filename();
649
650 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
651
652 if (defined $collect_dir) {
653 my $dirsep_regexp = &util::get_os_dirsep();
654
655 if ($collect_dir !~ /$dirsep_regexp$/) {
656 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
657 }
658
659 # This test is never going to fail on Windows -- is this a problem?
660
661 if ($source_filename !~ /^$dirsep_regexp/) {
662 $source_filename = &util::filename_cat($collect_dir, $source_filename);
663 }
664 }
665
666
667 # set the assocfile path (even if we have no assoc files - need this for lucene)
668 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
669 "assocfilepath",
670 "$doc_dir");
671 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
672 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
673 $dir = "" unless defined $dir;
674
675 my $utf8_real_filename = $assoc_file_rec->[0];
676
677 # for some reasons the image associate file has / before the full path
678 $utf8_real_filename =~ s/^\\(.*)/$1/i;
679
680## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
681 my $real_filename = $utf8_real_filename;
682 $real_filename = &util::downgrade_if_dos_filename($real_filename);
683
684 if (-e $real_filename) {
685
686 $filename = &util::filename_cat($working_dir, $afile);
687
688 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
689
690 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
691 "gsdlassocfile",
692 "$afile:$assoc_file_rec->[2]:$dir");
693 } elsif ($self->{'verbosity'} > 1) {
694 print $outhandle "BasePlugout::process couldn't copy the associated file " .
695 "$real_filename to $afile\n";
696 }
697 }
698}
699
700
701sub process_metafiles_metadata
702{
703 my $self = shift (@_);
704 my ($doc_obj) = @_;
705
706 my $top_section = $doc_obj->get_top_section();
707 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
708
709 foreach my $metafile_pair (@$metafiles) {
710 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
711
712 $doc_obj->metadata_file($full_metafile,$metafile);
713 }
714
715 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
716}
717
718sub archiveinf_files_to_field
719{
720 my $self = shift(@_);
721 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
722
723 foreach my $file_rec (@$files) {
724 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
725 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
726 # for some reasons the image associate file has / before the full path
727 $real_filename =~ s/^\\(.*)/$1/i;
728
729 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
730
731 if (-e $raw_filename) {
732
733# if (defined $collect_dir) {
734# my $collect_dir_re_safe = $collect_dir;
735# $collect_dir_re_safe =~ s/\\/\\\\/g;
736# $collect_dir_re_safe =~ s/\./\\./g;##
737
738# $real_filename =~ s/^$collect_dir_re_safe//;
739# }
740
741 if (defined $reverse_lookups) {
742 $reverse_lookups->{$real_filename} = 1;
743 }
744### push(@{$oid_files->{$field}},$full_file);
745 push(@{$oid_files->{$field}},$raw_filename);
746 }
747 else {
748 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
749 }
750 }
751}
752
753sub archiveinf_db
754{
755 my $self = shift (@_);
756 my ($doc_obj) = @_;
757
758 my $verbosity = $self->{'verbosity'};
759
760 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
761 if (defined $collect_dir) {
762 my $dirsep_regexp = &util::get_os_dirsep();
763
764 if ($collect_dir !~ /$dirsep_regexp$/) {
765 # ensure there is a slash at the end
766 $collect_dir .= &util::get_dirsep();
767 }
768 }
769
770 my $oid = $doc_obj->get_OID();
771 my $source_filename = $doc_obj->get_unmodified_source_filename();
772 my $working_info = $self->{'output_info'};
773 my $doc_info = $working_info->get_info($oid);
774
775 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
776 # doc_file is the path to the archive doc.xml. Make sure it has unix
777 # slashes, then if the collection is copied to linux, it can be built without reimport
778 $doc_file =~ s/\\/\//g;
779 my $oid_files = { 'doc-file' => $doc_file,
780 'index-status' => $index_status,
781 'src-file' => $source_filename,
782 'sort-meta' => $sortmeta,
783 'assoc-file' => [],
784 'meta-file' => [] };
785
786 my $reverse_lookups = { $source_filename => "1" };
787
788
789 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
790 $collect_dir,$oid_files,$reverse_lookups);
791
792
793 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
794 $collect_dir,$oid_files);
795
796 # Get the infodbtype value for this collection from the arcinfo object
797 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
798 my $output_dir = $self->{'output_dir'};
799
800 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
801 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
802
803 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
804
805 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
806 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
807 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
808 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
809
810 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
811 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
812 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
813
814 foreach my $rl (keys %$reverse_lookups) {
815 $working_info->add_reverseinfo($rl,$oid);
816 }
817
818 # meta files not set in reverese entry, but need to set the metadata flag
819 if (defined $doc_obj->get_meta_files()) {
820 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
821 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
822 $working_info->set_meta_file_flag($full_file);
823 }
824 }
825}
826
827
828sub set_sortmeta {
829 my $self = shift (@_);
830 my ($sortmeta, $removeprefix, $removesuffix) = @_;
831
832 $self->{'sortmeta'} = $sortmeta;
833 if (defined ($removeprefix) && $removeprefix ) {
834 $removeprefix =~ s/^\^//; # don't need a leading ^
835 $self->{'removeprefix'} = $removeprefix;
836 }
837 if (defined ($removesuffix) && $removesuffix) {
838 $removesuffix =~ s/\$$//; # don't need a trailing $
839 $self->{'removesuffix'} = $removesuffix;
840 }
841}
842
843sub open_xslt_pipe
844{
845 my $self = shift @_;
846 my ($output_file_name, $xslt_file)=@_;
847
848 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
849
850 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
851
852 my $mapping_file_path = "";
853
854 if ($ENV{'GSDLOS'} eq "windows"){
855 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
856 # this file:/// bit didn't work for me on windows XP
857 #$xslt_file = "\"file:///".$xslt_file."\"";
858 #$mapping_file_path = "\"file:///";
859 }
860 else{
861 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
862 }
863
864
865 $java_class_path = "\"".$java_class_path."\"";
866
867 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
868
869 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
870 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
871 $cmd .= "-m $mapping_file_path";
872 }
873
874 open(*XMLWRITER, $cmd)
875 or die "can't open pipe to xslt: $!";
876
877
878 $self->{'xslt_writer'} = *XMLWRITER;
879
880 print XMLWRITER "<?DocStart?>\n";
881 print XMLWRITER "$output_file_name\n";
882
883
884 }
885
886
887sub close_xslt_pipe
888{
889 my $self = shift @_;
890
891
892 return unless defined $self->{'xslt_writer'} ;
893
894 my $xsltwriter = $self->{'xslt_writer'};
895
896 print $xsltwriter "<?DocEnd?>\n";
897 close($xsltwriter);
898
899 undef $self->{'xslt_writer'};
900
901}
902
903sub close_file_output
904{
905 my ($self) = @_;
906
907 # make sure that the handle has been opened - it won't be if we failed
908 # to import any documents...
909 if (defined(fileno(GROUPPROCESS))) {
910 $self->output_xml_footer('GROUPPROCESS','Archive');
911 close GROUPPROCESS;
912 }
913
914 my $OID = $self->{'gs_OID'};
915 my $short_doc_file = $self->{'short_doc_file'};
916
917 if ($self->{'gzip'}) {
918 my $doc_file = $self->{'gs_filename'};
919 `gzip $doc_file`;
920 $doc_file .= ".gz";
921 $short_doc_file .= ".gz";
922 if (!-e $doc_file) {
923 my $outhandle = $self->{'output_handle'};
924 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
925 return 0;
926 }
927 }
928
929 # store reference in output_info
930 my $output_info = $self->{'output_info'};
931 return 0 if (!defined $output_info);
932 $output_info->add_info($OID, $short_doc_file, undef, undef);
933 return 1;
934}
935
936
937#the subclass should implement this method if is_group method could return 1.
938sub close_group_output{
939 my $self = shift (@_);
940}
941
942sub is_group {
943 my $self = shift (@_);
944 return 0;
945}
946
947my $dc_set = { Title => 1,
948 Creator => 1,
949 Subject => 1,
950 Description => 1,
951 Publisher => 1,
952 Contributor => 1,
953 Date => 1,
954 Type => 1,
955 Format => 1,
956 Identifier => 1,
957 Source => 1,
958 Language => 1,
959 Relation => 1,
960 Coverage => 1,
961 Rights => 1};
962
963
964# returns an XML representation of the dublin core metadata
965# if dc meta is not found, try ex mete
966sub get_dc_metadata {
967 my $self = shift(@_);
968 my ($doc_obj, $section, $version) = @_;
969
970 # build up string of dublin core metadata
971 $section="" unless defined $section;
972
973 my $section_ptr = $doc_obj->_lookup_section($section);
974 return "" unless defined $section_ptr;
975
976
977 my $explicit_dc = {};
978 my $explicit_ex = {};
979
980 my $all_text="";
981 foreach my $data (@{$section_ptr->{'metadata'}}){
982 my $escaped_value = &docprint::escape_text($data->[1]);
983 if ($data->[0]=~ m/^dc\./) {
984 $data->[0] =~ tr/[A-Z]/[a-z]/;
985
986 $data->[0] =~ m/^dc\.(.*)/;
987 my $dc_element = $1;
988
989 if (!defined $explicit_dc->{$dc_element}) {
990 $explicit_dc->{$dc_element} = [];
991 }
992 push(@{$explicit_dc->{$dc_element}},$escaped_value);
993
994 if (defined $version && ($version eq "oai_dc")) {
995 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
996 }
997 else {
998 # qualifier???
999 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1000 }
1001
1002 }
1003 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
1004 $data->[0] =~ m/^(ex\.)?(.*)/;
1005 my $ex_element = $2;
1006 my $lc_ex_element = lc($ex_element);
1007
1008 if (defined $dc_set->{$ex_element}) {
1009 if (!defined $explicit_ex->{$lc_ex_element}) {
1010 $explicit_ex->{$lc_ex_element} = [];
1011 }
1012 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1013 }
1014 }
1015 }
1016
1017 # go through dc_set and for any element *not* defined in explicit_dc
1018 # that does exist in explicit_ex, add it in as metadata
1019 foreach my $k ( keys %$dc_set ) {
1020 my $lc_k = lc($k);
1021
1022 if (!defined $explicit_dc->{$lc_k}) {
1023 if (defined $explicit_ex->{$lc_k}) {
1024
1025 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1026 my $dc_element = $lc_k;
1027 my $escaped_value = $v;
1028
1029 if (defined $version && ($version eq "oai_dc")) {
1030 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1031 }
1032 else {
1033 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1034 }
1035
1036 }
1037 }
1038 }
1039 }
1040
1041 if ($all_text eq "") {
1042 $all_text .= " There is no Dublin Core metatdata in this document\n";
1043 }
1044 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1045
1046 return $all_text;
1047}
1048
1049# Build up dublin_core metadata. Priority given to dc.* over ex.*
1050# This method was apparently added by Jeffrey and committed by Shaoqun.
1051# But we don't know why it was added, so not using it anymore.
1052sub new_get_dc_metadata {
1053
1054 my $self = shift(@_);
1055 my ($doc_obj, $section, $version) = @_;
1056
1057 # build up string of dublin core metadata
1058 $section="" unless defined $section;
1059
1060 my $section_ptr=$doc_obj->_lookup_section($section);
1061 return "" unless defined $section_ptr;
1062
1063 my $all_text = "";
1064 foreach my $data (@{$section_ptr->{'metadata'}}){
1065 my $escaped_value = &docprint::escape_text($data->[1]);
1066 my $dc_element = $data->[0];
1067
1068 my @array = split('\.',$dc_element);
1069 my ($type,$name);
1070
1071 if(defined $array[1])
1072 {
1073 $type = $array[0];
1074 $name = $array[1];
1075 }
1076 else
1077 {
1078 $type = "ex";
1079 $name = $array[0];
1080 }
1081
1082 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1083 }
1084 return $all_text;
1085}
1086
1087
10881;
Note: See TracBrowser for help on using the repository browser.