source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 22328

Last change on this file since 22328 was 22328, checked in by kjdon, 14 years ago

store the path to doc.xml with unix / slashes so that the collection can be copied between OS and still be rebuilt without reimport. Note that full paths are used for file paths (not for doc.xml path) so incremental build won't work if collection is moved

  • Property svn:keywords set to Author Date Id Revision
File size: 28.4 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'deft' => "",
61 'hiddengli' => "no"},
62 { 'name' => "output_handle",
63 'desc' => "{BasPlugout.output_handle}",
64 'type' => "string",
65 'deft' => 'STDERR',
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "verbosity",
69 'desc' => "{BasPlugout.verbosity}",
70 'type' => "int",
71 'deft' => "0",
72 'reqd' => "no",
73 'hiddengli' => "no"},
74 { 'name' => "gzip_output",
75 'desc' => "{BasPlugout.gzip_output}",
76 'type' => "flag",
77 'reqd' => "no",
78 'hiddengli' => "no"},
79 { 'name' => "debug",
80 'desc' => "{BasPlugout.debug}",
81 'type' => "flag",
82 'reqd' => "no",
83 'hiddengli' => "yes"}
84];
85
86my $options = { 'name' => "BasePlugout",
87 'desc' => "{BasPlugout.desc}",
88 'abstract' => "yes",
89 'inherits' => "no",
90 'args' => $arguments};
91
92sub new
93{
94 my $class = shift (@_);
95
96 my ($plugoutlist,$args,$hashArgOptLists) = @_;
97 push(@$plugoutlist, $class);
98
99 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102 push(@{$hashArgOptLists->{"OptList"}},$options);
103
104 my $self = {};
105 $self->{'plugout_type'} = $class;
106 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107 $self->{"info_only"} = 0;
108
109 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110 # the args, just return the object.
111 foreach my $strArg (@{$args})
112 {
113 if(defined $strArg && $strArg eq "-gsdlinfo")
114 {
115 $self->{"info_only"} = 1;
116 return bless $self, $class;
117 }
118 }
119
120 delete $self->{"info_only"};
121
122 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123 {
124 my $classTempClass = bless $self, $class;
125 print STDERR "<BadPlugout d=$plugout_name>\n";
126 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127 $classTempClass->print_txt_usage(""); # Use default resource bundle
128 die "\n";
129 }
130
131
132 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
133 {
134 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135 if (!defined $full_file_path) {
136 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137 die "\n";
138 }
139 $self->{'xslt_file'} = $full_file_path;
140 }
141
142 $self->{'gs_count'} = 0;
143
144 $self->{'keep_import_structure'} = 0;
145
146 return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152 my $self = shift(@_);
153 my $header = shift(@_);
154 my $high_level_information_only = shift(@_);
155
156 # XML output is always in UTF-8
157 gsprintf::output_strings_in_UTF8;
158
159 if ($header) {
160 &PrintUsage::print_xml_header("plugout");
161 }
162 $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168 my $self = shift(@_);
169 my $high_level_information_only = shift(@_);
170
171 my $optionlistref = $self->{'option_list'};
172 my @optionlist = @$optionlistref;
173 my $plugoutoptions = shift(@$optionlistref);
174 return if (!defined($plugoutoptions));
175
176 gsprintf(STDERR, "<PlugoutInfo>\n");
177 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
178 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179 $desc =~ s/</&amp;lt;/g; # doubly escaped
180 $desc =~ s/>/&amp;gt;/g;
181 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184 unless (defined($high_level_information_only)) {
185 gsprintf(STDERR, " <Arguments>\n");
186 if (defined($plugoutoptions->{'args'})) {
187 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188 }
189 gsprintf(STDERR, " </Arguments>\n");
190
191 # Recurse up the plugout hierarchy
192 $self->print_xml();
193 }
194 gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200 my $self = shift(@_);
201
202 # Print the usage message for a plugout (recursively)
203 my $descoffset = $self->determine_description_offset(0);
204 $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209 my $self = shift(@_);
210 my $maxoffset = shift(@_);
211
212 my $optionlistref = $self->{'option_list'};
213 my @optionlist = @$optionlistref;
214 my $plugoutoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($plugoutoptions));
216
217 # Find the length of the longest option string of this download
218 my $plugoutargs = $plugoutoptions->{'args'};
219 if (defined($plugoutargs)) {
220 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the download hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235 my $self = shift(@_);
236 my $descoffset = shift(@_);
237 my $isleafclass = shift(@_);
238
239 my $optionlistref = $self->{'option_list'};
240 my @optionlist = @$optionlistref;
241 my $plugoutoptions = shift(@$optionlistref);
242 return if (!defined($plugoutoptions));
243
244 my $plugoutname = $plugoutoptions->{'name'};
245 my $plugoutargs = $plugoutoptions->{'args'};
246 my $plugoutdesc = $plugoutoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugoutdesc)) {
251 gsprintf(STDERR, "$plugoutdesc\n\n");
252 }
253 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254 }
255
256 # Display the download options, if there are some
257 if (defined($plugoutargs)) {
258 # Calculate the column offset of the option descriptions
259 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266 }
267
268 # Display the download options
269 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270 }
271
272 # Recurse up the download hierarchy
273 $self->print_plugout_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280 my ($strFunctionName,$strError) = @_;
281 {
282 print "Error occoured in BasePlugout.pm\n".
283 "In Function: ".$strFunctionName."\n".
284 "Error Message: ".$strError."\n";
285 exit(-1);
286 }
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291 my $self = shift (@_);
292 my ($type, $metadata) = @_;
293
294 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295 $self->{'OIDtype'} = $type;
296 } else {
297 $self->{'OIDtype'} = "hash";
298 }
299 if ($type =~ /^assigned$/) {
300 if (defined $metadata) {
301 $self->{'OIDmetadata'} = $metadata;
302 } else {
303 $self->{'OIDmetadata'} = "dc.Identifier";
304 }
305 }
306}
307
308sub set_output_dir
309{
310 my $self = shift @_;
311 my ($output_dir) = @_;
312
313 $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318 my $self = shift @_;
319 my ($output_dir) = @_;
320
321 $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326 my $self = shift (@_);
327
328 return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333 my $self = shift (@_);
334
335 return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340 my $self = shift (@_);
341
342 return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348 my $self = shift (@_);
349
350 my ($output_file_name) = @_;
351
352 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
353
354 return *OUTPUT;
355}
356
357sub release_output_handler
358{
359 my $self = shift (@_);
360 my ($outhandler) = @_;
361
362 close($outhandler);
363
364}
365
366sub output_xml_header {
367 my $self = shift (@_);
368 my ($handle,$docroot,$nondoctype) = @_;
369
370 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
371
372 if (!defined $nondoctype){
373 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
374 }
375
376 print $handle "<$docroot>\n" if defined $docroot;
377}
378
379sub output_xml_footer {
380 my $self = shift (@_);
381 my ($handle,$docroot) = @_;
382 print $handle "</$docroot>\n" if defined $docroot;
383}
384
385sub process {
386 my $self = shift (@_);
387 my ($doc_obj) = @_;
388
389 $doc_obj->set_lastmodified();
390
391 if ($self->{'group_size'} > 1) {
392 $self->group_process ($doc_obj);
393 return;
394 }
395
396 my $OID = $doc_obj->get_OID();
397 $OID = "NULL" unless defined $OID;
398
399 my $top_section = $doc_obj->get_top_section();
400
401 #get document's directory
402 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
403
404 my $output_info = $self->{'output_info'};
405 return if (!defined $output_info);
406
407 ##############################
408 # call subclass' saveas method
409 ##############################
410 $self->saveas($doc_obj,$doc_dir);
411 $self->archiveinf_db($doc_obj,$doc_dir);
412
413}
414
415sub store_output_info_reference {
416 my $self = shift (@_);
417 my ($doc_obj) = @_;
418
419 my $output_info = $self->{'output_info'};
420 my $metaname = $self->{'sortmeta'};
421 if (!defined $metaname || $metaname !~ /\S/) {
422 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
423 return;
424 }
425
426 my $metadata = "";
427 my $top_section = $doc_obj->get_top_section();
428
429 my @commameta_list = split(/,/, $metaname);
430 foreach my $cmn (@commameta_list) {
431 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
432 if ($meta) {
433 # do remove prefix/suffix - this will apply to all values
434 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
435 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
436 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
437 $metadata .= $meta if ($meta);
438 }
439 }
440
441 # store reference in the output_info
442 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
443
444}
445
446sub group_process {
447
448 my $self = shift (@_);
449 my ($doc_obj) = @_;
450
451 my $OID = $doc_obj->get_OID();
452 $OID = "NULL" unless defined $OID;
453
454 my $groupsize = $self->{'group_size'};
455 my $gs_count = $self->{'gs_count'};
456 my $open_new_file = (($gs_count % $groupsize)==0);
457 my $outhandle = $self->{'output_handle'};
458
459 # opening a new file, or document has assoicated files => directory needed
460 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
461
462 # The directory the archive file (doc.xml) and all associated files
463 # should end up in
464 my $doc_dir;
465 # If we've determined its time for a new file, open it now
466 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
467 {
468 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
469 # only if opening new file
470 my $output_dir = $self->get_output_dir();
471 &util::mk_all_dir ($output_dir) unless -e $output_dir;
472 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
473 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
474
475 if ($gs_count>0)
476 {
477 return if (!$self->close_file_output());
478 }
479
480 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
481
482
483 $self->{'gs_filename'} = $doc_file;
484 $self->{'short_doc_file'} = $short_doc_file;
485 $self->{'gs_OID'} = $OID;
486 $self->{'gs_doc_dir'} = $doc_dir;
487
488 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
489 }
490 # Otherwise load the same archive document directory used last time
491 else
492 {
493 $doc_dir = $self->{'gs_doc_dir'};
494 }
495
496 # copy all the associated files, add this information as metadata
497 # to the document
498 print $outhandle "Writing associated files to $doc_dir\n";
499 $self->process_assoc_files ($doc_obj, $doc_dir);
500
501 # look up 'gsdlmetafile' metadata and store that information
502 # explicitly in $doc_obj
503 $self->process_metafiles_metadata ($doc_obj);
504 }
505
506 # save this document
507 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
508 print GROUPPROCESS $section_text;
509
510 $self->{'gs_count'}++;
511}
512
513
514sub saveas {
515 my $self = shift (@_);
516
517 die "Basplug::saveas function must be implemented in sub classes\n";
518}
519
520sub get_doc_dir {
521 my $self = shift (@_);
522 my ($OID, $source_filename) = @_;
523
524 my $working_dir = $self->get_output_dir();
525 my $working_info = $self->{'output_info'};
526 return if (!defined $working_info);
527
528 my $doc_info = $working_info->get_info($OID);
529 my $doc_dir = '';
530
531 if (defined $doc_info && scalar(@$doc_info) >= 1)
532 {
533 # This OID already has an archives directory, so use it again
534 $doc_dir = $doc_info->[0];
535 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
536 }
537 elsif ($self->{'keep_import_structure'})
538 {
539 $source_filename = &File::Basename::dirname($source_filename);
540 $source_filename =~ s/[\\\/]+/\//g;
541 $source_filename =~ s/\/$//;
542
543 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
544 }
545
546 # We have to use a new archives directory for this document
547 if ($doc_dir eq "")
548 {
549 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
550 }
551
552 if (!defined $self->{'group'} || !$self->{'group'}){
553 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
554 }
555
556 return $doc_dir;
557}
558
559sub get_new_doc_dir{
560 my $self = shift (@_);
561 my($working_info,$working_dir,$OID) = @_;
562
563
564 my $doc_dir = "";
565 my $doc_dir_rest = $OID;
566
567 # remove any \ and / from the OID
568 $doc_dir_rest =~ s/[\\\/]//g;
569
570 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
571 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
572
573 my $doc_dir_num = 0;
574
575 do {
576 $doc_dir .= "/" if $doc_dir_num > 0;
577 if ($doc_dir_rest =~ s/^(.{1,8})//) {
578 $doc_dir .= $1;
579 $doc_dir_num++;
580 }
581 } while ($doc_dir_rest ne "" &&
582 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
583 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
584 my $i = 1;
585 my $doc_dir_base = $doc_dir;
586 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
587 $doc_dir = "$doc_dir_base-$i";
588 $i++;
589 }
590
591 return "$doc_dir.dir";
592}
593
594sub process_assoc_files {
595 my $self = shift (@_);
596 my ($doc_obj, $doc_dir, $handle) = @_;
597
598 my $outhandle = $self->{'output_handle'};
599
600 my $output_dir = $self->get_output_dir();
601 return if (!defined $output_dir);
602
603 &util::mk_all_dir ($output_dir) unless -e $output_dir;
604
605 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
606 &util::mk_all_dir ($working_dir) unless -e $working_dir;
607
608 my @assoc_files = ();
609 my $filename;;
610
611 my $source_filename = $doc_obj->get_source_filename();
612
613 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
614
615 if (defined $collect_dir) {
616 my $dirsep_regexp = &util::get_os_dirsep();
617
618 if ($collect_dir !~ /$dirsep_regexp$/) {
619 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
620 }
621
622 # This test is never going to fail on Windows -- is this a problem?
623
624 if ($source_filename !~ /^$dirsep_regexp/) {
625 $source_filename = &util::filename_cat($collect_dir, $source_filename);
626 }
627 }
628
629
630 # set the assocfile path (even if we have no assoc files - need this for lucene)
631 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
632 "assocfilepath",
633 "$doc_dir");
634 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
635 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
636 $dir = "" unless defined $dir;
637
638
639 my $real_filename = $assoc_file_rec->[0];
640 # for some reasons the image associate file has / before the full path
641 $real_filename =~ s/^\\(.*)/$1/i;
642 if (-e $real_filename) {
643
644 $filename = &util::filename_cat($working_dir, $afile);
645
646 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
647
648 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
649 "gsdlassocfile",
650 "$afile:$assoc_file_rec->[2]:$dir");
651 } elsif ($self->{'verbosity'} > 2) {
652 print $outhandle "BasePlugout::process couldn't copy the associated file " .
653 "$real_filename to $afile\n";
654 }
655 }
656}
657
658
659sub process_metafiles_metadata
660{
661 my $self = shift (@_);
662 my ($doc_obj) = @_;
663
664 my $top_section = $doc_obj->get_top_section();
665 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
666
667 foreach my $metafile_pair (@$metafiles) {
668 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
669
670 $doc_obj->metadata_file($full_metafile,$metafile);
671 }
672
673 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
674}
675
676sub archiveinf_files_to_field
677{
678 my $self = shift(@_);
679 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
680
681 foreach my $file_rec (@$files) {
682 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
683 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
684 # for some reasons the image associate file has / before the full path
685 $real_filename =~ s/^\\(.*)/$1/i;
686 if (-e $real_filename) {
687
688# if (defined $collect_dir) {
689# my $collect_dir_re_safe = $collect_dir;
690# $collect_dir_re_safe =~ s/\\/\\\\/g;
691# $collect_dir_re_safe =~ s/\./\\./g;##
692
693# $real_filename =~ s/^$collect_dir_re_safe//;
694# }
695
696 if (defined $reverse_lookups) {
697 $reverse_lookups->{$real_filename} = 1;
698 }
699 push(@{$oid_files->{$field}},$full_file);
700 }
701 else {
702 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
703 }
704 }
705}
706
707sub archiveinf_db
708{
709 my $self = shift (@_);
710 my ($doc_obj) = @_;
711
712 my $verbosity = $self->{'verbosity'};
713
714 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
715 if (defined $collect_dir) {
716 my $dirsep_regexp = &util::get_os_dirsep();
717
718 if ($collect_dir !~ /$dirsep_regexp$/) {
719 # ensure there is a slash at the end
720 $collect_dir .= &util::get_dirsep();
721 }
722 }
723
724 my $oid = $doc_obj->get_OID();
725 my $source_filename = $doc_obj->get_unmodified_source_filename();
726 my $working_info = $self->{'output_info'};
727 my $doc_info = $working_info->get_info($oid);
728
729 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
730 # doc_file is the path to the archive doc.xml. Make sure it has unix
731 # slashes, then if the collection is copied to linux, it can be built without reimport
732 $doc_file =~ s/\\/\//g;
733 my $oid_files = { 'doc-file' => $doc_file,
734 'index-status' => $index_status,
735 'src-file' => $source_filename,
736 'sort-meta' => $sortmeta,
737 'assoc-file' => [],
738 'meta-file' => [] };
739
740 my $reverse_lookups = { $source_filename => "1" };
741
742
743 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
744 $collect_dir,$oid_files,$reverse_lookups);
745
746
747 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
748 $collect_dir,$oid_files);
749
750 # Get the infodbtype value for this collection from the arcinfo object
751 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
752 my $output_dir = $self->{'output_dir'};
753
754 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
755 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
756
757 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
758
759 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
760 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
761 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
762 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
763
764 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
765 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
766 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
767
768 foreach my $rl (keys %$reverse_lookups) {
769 $working_info->add_reverseinfo($rl,$oid);
770 }
771
772 # meta files not set in reverese entry, but need to set the metadata flag
773 if (defined $doc_obj->get_meta_files()) {
774 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
775 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
776 $working_info->set_meta_file_flag($full_file);
777 }
778 }
779}
780
781
782sub set_sortmeta {
783 my $self = shift (@_);
784 my ($sortmeta, $removeprefix, $removesuffix) = @_;
785
786 $self->{'sortmeta'} = $sortmeta;
787 if (defined ($removeprefix) && $removeprefix ) {
788 $removeprefix =~ s/^\^//; # don't need a leading ^
789 $self->{'removeprefix'} = $removeprefix;
790 }
791 if (defined ($removesuffix) && $removesuffix) {
792 $removesuffix =~ s/\$$//; # don't need a trailing $
793 $self->{'removesuffix'} = $removesuffix;
794 }
795}
796
797sub open_xslt_pipe
798{
799 my $self = shift @_;
800 my ($output_file_name, $xslt_file)=@_;
801
802 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
803
804 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
805
806 my $mapping_file_path = "";
807
808 if ($ENV{'GSDLOS'} eq "windows"){
809 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
810 # this file:/// bit didn't work for me on windows XP
811 #$xslt_file = "\"file:///".$xslt_file."\"";
812 #$mapping_file_path = "\"file:///";
813 }
814 else{
815 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
816 }
817
818
819 $java_class_path = "\"".$java_class_path."\"";
820
821 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
822
823 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
824 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
825 $cmd .= "-m $mapping_file_path";
826 }
827
828 open(*XMLWRITER, $cmd)
829 or die "can't open pipe to xslt: $!";
830
831
832 $self->{'xslt_writer'} = *XMLWRITER;
833
834 print XMLWRITER "<?DocStart?>\n";
835 print XMLWRITER "$output_file_name\n";
836
837
838 }
839
840
841sub close_xslt_pipe
842{
843 my $self = shift @_;
844
845
846 return unless defined $self->{'xslt_writer'} ;
847
848 my $xsltwriter = $self->{'xslt_writer'};
849
850 print $xsltwriter "<?DocEnd?>\n";
851 close($xsltwriter);
852
853 undef $self->{'xslt_writer'};
854
855}
856
857sub close_file_output
858{
859 my ($self) = @_;
860
861 # make sure that the handle has been opened - it won't be if we failed
862 # to import any documents...
863 if (defined(fileno(GROUPPROCESS))) {
864 $self->output_xml_footer('GROUPPROCESS','Archive');
865 close GROUPPROCESS;
866 }
867
868 my $OID = $self->{'gs_OID'};
869 my $short_doc_file = $self->{'short_doc_file'};
870
871 if ($self->{'gzip'}) {
872 my $doc_file = $self->{'gs_filename'};
873 `gzip $doc_file`;
874 $doc_file .= ".gz";
875 $short_doc_file .= ".gz";
876 if (!-e $doc_file) {
877 my $outhandle = $self->{'output_handle'};
878 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
879 return 0;
880 }
881 }
882
883 # store reference in output_info
884 my $output_info = $self->{'output_info'};
885 return 0 if (!defined $output_info);
886 $output_info->add_info($OID, $short_doc_file, undef, undef);
887 return 1;
888}
889
890
891#the subclass should implement this method if is_group method could return 1.
892sub close_group_output{
893 my $self = shift (@_);
894}
895
896sub is_group {
897 my $self = shift (@_);
898 return 0;
899}
900
901my $dc_set = { Title => 1,
902 Creator => 1,
903 Subject => 1,
904 Description => 1,
905 Publisher => 1,
906 Contributor => 1,
907 Date => 1,
908 Type => 1,
909 Format => 1,
910 Identifier => 1,
911 Source => 1,
912 Language => 1,
913 Relation => 1,
914 Coverage => 1,
915 Rights => 1};
916
917
918# returns an XML representation of the dublin core metadata
919# if dc meta is not found, try ex mete
920sub get_dc_metadata {
921 my $self = shift(@_);
922 my ($doc_obj, $section, $version) = @_;
923
924 # build up string of dublin core metadata
925 $section="" unless defined $section;
926
927 my $section_ptr = $doc_obj->_lookup_section($section);
928 return "" unless defined $section_ptr;
929
930
931 my $explicit_dc = {};
932 my $explicit_ex = {};
933
934 my $all_text="";
935 foreach my $data (@{$section_ptr->{'metadata'}}){
936 my $escaped_value = &docprint::escape_text($data->[1]);
937 if ($data->[0]=~ m/^dc\./) {
938 $data->[0] =~ tr/[A-Z]/[a-z]/;
939
940 $data->[0] =~ m/^dc\.(.*)/;
941 my $dc_element = $1;
942
943 if (!defined $explicit_dc->{$dc_element}) {
944 $explicit_dc->{$dc_element} = [];
945 }
946 push(@{$explicit_dc->{$dc_element}},$escaped_value);
947
948 if (defined $version && ($version eq "oai_dc")) {
949 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
950 }
951 else {
952 # qualifier???
953 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
954 }
955
956 }
957 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
958 $data->[0] =~ m/^(ex\.)?(.*)/;
959 my $ex_element = $2;
960 my $lc_ex_element = lc($ex_element);
961
962 if (defined $dc_set->{$ex_element}) {
963 if (!defined $explicit_ex->{$lc_ex_element}) {
964 $explicit_ex->{$lc_ex_element} = [];
965 }
966 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
967 }
968 }
969 }
970
971 # go through dc_set and for any element *not* defined in explicit_dc
972 # that does exist in explicit_ex, add it in as metadata
973 foreach my $k ( keys %$dc_set ) {
974 my $lc_k = lc($k);
975
976 if (!defined $explicit_dc->{$lc_k}) {
977 if (defined $explicit_ex->{$lc_k}) {
978
979 foreach my $v (@{$explicit_ex->{$lc_k}}) {
980 my $dc_element = $lc_k;
981 my $escaped_value = $v;
982
983 if (defined $version && ($version eq "oai_dc")) {
984 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
985 }
986 else {
987 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
988 }
989
990 }
991 }
992 }
993 }
994
995 if ($all_text eq "") {
996 $all_text .= " There is no Dublin Core metatdata in this document\n";
997 }
998 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
999
1000 return $all_text;
1001}
1002
1003# Build up dublin_core metadata. Priority given to dc.* over ex.*
1004# This method was apparently added by Jeffrey and committed by Shaoqun.
1005# But we don't know why it was added, so not using it anymore.
1006sub new_get_dc_metadata {
1007
1008 my $self = shift(@_);
1009 my ($doc_obj, $section, $version) = @_;
1010
1011 # build up string of dublin core metadata
1012 $section="" unless defined $section;
1013
1014 my $section_ptr=$doc_obj->_lookup_section($section);
1015 return "" unless defined $section_ptr;
1016
1017 my $all_text = "";
1018 foreach my $data (@{$section_ptr->{'metadata'}}){
1019 my $escaped_value = &docprint::escape_text($data->[1]);
1020 my $dc_element = $data->[0];
1021
1022 my @array = split('\.',$dc_element);
1023 my ($type,$name);
1024
1025 if(defined $array[1])
1026 {
1027 $type = $array[0];
1028 $name = $array[1];
1029 }
1030 else
1031 {
1032 $type = "ex";
1033 $name = $array[0];
1034 }
1035
1036 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1037 }
1038 return $all_text;
1039}
1040
1041
10421;
Note: See TracBrowser for help on using the repository browser.