source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 22900

Last change on this file since 22900 was 22818, checked in by davidb, 14 years ago

Tightening up (slightly) of DOCTYPE line. Previously, it was quite easy for the root name printed in the DOCTYPE line (which was hard-wired to 'Archive') to not match the root element used. Changed to something that is more consistent, but could still be better.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'deft' => "",
61 'hiddengli' => "no"},
62 { 'name' => "output_handle",
63 'desc' => "{BasPlugout.output_handle}",
64 'type' => "string",
65 'deft' => 'STDERR',
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "verbosity",
69 'desc' => "{BasPlugout.verbosity}",
70 'type' => "int",
71 'deft' => "0",
72 'reqd' => "no",
73 'hiddengli' => "no"},
74 { 'name' => "gzip_output",
75 'desc' => "{BasPlugout.gzip_output}",
76 'type' => "flag",
77 'reqd' => "no",
78 'hiddengli' => "no"},
79 { 'name' => "debug",
80 'desc' => "{BasPlugout.debug}",
81 'type' => "flag",
82 'reqd' => "no",
83 'hiddengli' => "yes"}
84];
85
86my $options = { 'name' => "BasePlugout",
87 'desc' => "{BasPlugout.desc}",
88 'abstract' => "yes",
89 'inherits' => "no",
90 'args' => $arguments};
91
92sub new
93{
94 my $class = shift (@_);
95
96 my ($plugoutlist,$args,$hashArgOptLists) = @_;
97 push(@$plugoutlist, $class);
98
99 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102 push(@{$hashArgOptLists->{"OptList"}},$options);
103
104 my $self = {};
105 $self->{'plugout_type'} = $class;
106 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107 $self->{"info_only"} = 0;
108
109 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110 # the args, just return the object.
111 foreach my $strArg (@{$args})
112 {
113 if(defined $strArg && $strArg eq "-gsdlinfo")
114 {
115 $self->{"info_only"} = 1;
116 return bless $self, $class;
117 }
118 }
119
120 delete $self->{"info_only"};
121
122 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123 {
124 my $classTempClass = bless $self, $class;
125 print STDERR "<BadPlugout d=$plugout_name>\n";
126 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127 $classTempClass->print_txt_usage(""); # Use default resource bundle
128 die "\n";
129 }
130
131
132 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
133 {
134 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135 if (!defined $full_file_path) {
136 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137 die "\n";
138 }
139 $self->{'xslt_file'} = $full_file_path;
140 }
141
142 $self->{'gs_count'} = 0;
143
144 $self->{'keep_import_structure'} = 0;
145
146 return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152 my $self = shift(@_);
153 my $header = shift(@_);
154 my $high_level_information_only = shift(@_);
155
156 # XML output is always in UTF-8
157 gsprintf::output_strings_in_UTF8;
158
159 if ($header) {
160 &PrintUsage::print_xml_header("plugout");
161 }
162 $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168 my $self = shift(@_);
169 my $high_level_information_only = shift(@_);
170
171 my $optionlistref = $self->{'option_list'};
172 my @optionlist = @$optionlistref;
173 my $plugoutoptions = shift(@$optionlistref);
174 return if (!defined($plugoutoptions));
175
176 gsprintf(STDERR, "<PlugoutInfo>\n");
177 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
178 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179 $desc =~ s/</&amp;lt;/g; # doubly escaped
180 $desc =~ s/>/&amp;gt;/g;
181 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184 unless (defined($high_level_information_only)) {
185 gsprintf(STDERR, " <Arguments>\n");
186 if (defined($plugoutoptions->{'args'})) {
187 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188 }
189 gsprintf(STDERR, " </Arguments>\n");
190
191 # Recurse up the plugout hierarchy
192 $self->print_xml();
193 }
194 gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200 my $self = shift(@_);
201
202 # Print the usage message for a plugout (recursively)
203 my $descoffset = $self->determine_description_offset(0);
204 $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209 my $self = shift(@_);
210 my $maxoffset = shift(@_);
211
212 my $optionlistref = $self->{'option_list'};
213 my @optionlist = @$optionlistref;
214 my $plugoutoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($plugoutoptions));
216
217 # Find the length of the longest option string of this download
218 my $plugoutargs = $plugoutoptions->{'args'};
219 if (defined($plugoutargs)) {
220 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the download hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235 my $self = shift(@_);
236 my $descoffset = shift(@_);
237 my $isleafclass = shift(@_);
238
239 my $optionlistref = $self->{'option_list'};
240 my @optionlist = @$optionlistref;
241 my $plugoutoptions = shift(@$optionlistref);
242 return if (!defined($plugoutoptions));
243
244 my $plugoutname = $plugoutoptions->{'name'};
245 my $plugoutargs = $plugoutoptions->{'args'};
246 my $plugoutdesc = $plugoutoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugoutdesc)) {
251 gsprintf(STDERR, "$plugoutdesc\n\n");
252 }
253 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254 }
255
256 # Display the download options, if there are some
257 if (defined($plugoutargs)) {
258 # Calculate the column offset of the option descriptions
259 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266 }
267
268 # Display the download options
269 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270 }
271
272 # Recurse up the download hierarchy
273 $self->print_plugout_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280 my ($strFunctionName,$strError) = @_;
281 {
282 print "Error occoured in BasePlugout.pm\n".
283 "In Function: ".$strFunctionName."\n".
284 "Error Message: ".$strError."\n";
285 exit(-1);
286 }
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291 my $self = shift (@_);
292 my ($type, $metadata) = @_;
293
294 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295 $self->{'OIDtype'} = $type;
296 } else {
297 $self->{'OIDtype'} = "hash";
298 }
299 if ($type =~ /^assigned$/) {
300 if (defined $metadata) {
301 $self->{'OIDmetadata'} = $metadata;
302 } else {
303 $self->{'OIDmetadata'} = "dc.Identifier";
304 }
305 }
306}
307
308sub set_output_dir
309{
310 my $self = shift @_;
311 my ($output_dir) = @_;
312
313 $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318 my $self = shift @_;
319 my ($output_dir) = @_;
320
321 $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326 my $self = shift (@_);
327
328 return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333 my $self = shift (@_);
334
335 return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340 my $self = shift (@_);
341
342 return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348 my $self = shift (@_);
349
350 my ($output_file_name) = @_;
351
352 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
353
354 return *OUTPUT;
355}
356
357sub release_output_handler
358{
359 my $self = shift (@_);
360 my ($outhandler) = @_;
361
362 close($outhandler);
363
364}
365
366sub output_xml_header {
367 my $self = shift (@_);
368 my ($handle,$docroot,$nondoctype) = @_;
369
370 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
371
372 if (!defined $nondoctype){
373 my $doctype = (defined $docroot) ? $docroot : "Section";
374
375 # Used to be '<!DOCTYPE Archive SYSTEM ...'
376
377 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
378 }
379
380 print $handle "<$docroot>\n" if defined $docroot;
381}
382
383sub output_xml_footer {
384 my $self = shift (@_);
385 my ($handle,$docroot) = @_;
386 print $handle "</$docroot>\n" if defined $docroot;
387}
388
389sub process {
390 my $self = shift (@_);
391 my ($doc_obj) = @_;
392
393 $doc_obj->set_lastmodified();
394
395 if ($self->{'group_size'} > 1) {
396 $self->group_process ($doc_obj);
397 return;
398 }
399
400 my $OID = $doc_obj->get_OID();
401 $OID = "NULL" unless defined $OID;
402
403 my $top_section = $doc_obj->get_top_section();
404
405 #get document's directory
406 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
407
408 my $output_info = $self->{'output_info'};
409 return if (!defined $output_info);
410
411 ##############################
412 # call subclass' saveas method
413 ##############################
414 $self->saveas($doc_obj,$doc_dir);
415 $self->archiveinf_db($doc_obj,$doc_dir);
416
417}
418
419sub store_output_info_reference {
420 my $self = shift (@_);
421 my ($doc_obj) = @_;
422
423 my $output_info = $self->{'output_info'};
424 my $metaname = $self->{'sortmeta'};
425 if (!defined $metaname || $metaname !~ /\S/) {
426 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
427 return;
428 }
429
430 my $metadata = "";
431 my $top_section = $doc_obj->get_top_section();
432
433 my @commameta_list = split(/,/, $metaname);
434 foreach my $cmn (@commameta_list) {
435 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
436 if ($meta) {
437 # do remove prefix/suffix - this will apply to all values
438 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
439 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
440 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
441 $metadata .= $meta if ($meta);
442 }
443 }
444
445 # store reference in the output_info
446 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
447
448}
449
450sub group_process {
451
452 my $self = shift (@_);
453 my ($doc_obj) = @_;
454
455 my $OID = $doc_obj->get_OID();
456 $OID = "NULL" unless defined $OID;
457
458 my $groupsize = $self->{'group_size'};
459 my $gs_count = $self->{'gs_count'};
460 my $open_new_file = (($gs_count % $groupsize)==0);
461 my $outhandle = $self->{'output_handle'};
462
463 # opening a new file, or document has assoicated files => directory needed
464 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
465
466 # The directory the archive file (doc.xml) and all associated files
467 # should end up in
468 my $doc_dir;
469 # If we've determined its time for a new file, open it now
470 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
471 {
472 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
473 # only if opening new file
474 my $output_dir = $self->get_output_dir();
475 &util::mk_all_dir ($output_dir) unless -e $output_dir;
476 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
477 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
478
479 if ($gs_count>0)
480 {
481 return if (!$self->close_file_output());
482 }
483
484 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
485
486
487 $self->{'gs_filename'} = $doc_file;
488 $self->{'short_doc_file'} = $short_doc_file;
489 $self->{'gs_OID'} = $OID;
490 $self->{'gs_doc_dir'} = $doc_dir;
491
492 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
493 }
494 # Otherwise load the same archive document directory used last time
495 else
496 {
497 $doc_dir = $self->{'gs_doc_dir'};
498 }
499
500 # copy all the associated files, add this information as metadata
501 # to the document
502 print $outhandle "Writing associated files to $doc_dir\n";
503 $self->process_assoc_files ($doc_obj, $doc_dir);
504
505 # look up 'gsdlmetafile' metadata and store that information
506 # explicitly in $doc_obj
507 $self->process_metafiles_metadata ($doc_obj);
508 }
509
510 # save this document
511 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
512 print GROUPPROCESS $section_text;
513
514 $self->{'gs_count'}++;
515}
516
517
518sub saveas {
519 my $self = shift (@_);
520
521 die "Basplug::saveas function must be implemented in sub classes\n";
522}
523
524sub get_doc_dir {
525 my $self = shift (@_);
526 my ($OID, $source_filename) = @_;
527
528 my $working_dir = $self->get_output_dir();
529 my $working_info = $self->{'output_info'};
530 return if (!defined $working_info);
531
532 my $doc_info = $working_info->get_info($OID);
533 my $doc_dir = '';
534
535 if (defined $doc_info && scalar(@$doc_info) >= 1)
536 {
537 # This OID already has an archives directory, so use it again
538 $doc_dir = $doc_info->[0];
539 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
540 }
541 elsif ($self->{'keep_import_structure'})
542 {
543 $source_filename = &File::Basename::dirname($source_filename);
544 $source_filename =~ s/[\\\/]+/\//g;
545 $source_filename =~ s/\/$//;
546
547 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
548 }
549
550 # We have to use a new archives directory for this document
551 if ($doc_dir eq "")
552 {
553 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
554 }
555
556 if (!defined $self->{'group'} || !$self->{'group'}){
557 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
558 }
559
560 return $doc_dir;
561}
562
563sub get_new_doc_dir{
564 my $self = shift (@_);
565 my($working_info,$working_dir,$OID) = @_;
566
567
568 my $doc_dir = "";
569 my $doc_dir_rest = $OID;
570
571 # remove any \ and / from the OID
572 $doc_dir_rest =~ s/[\\\/]//g;
573
574 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
575 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
576
577 my $doc_dir_num = 0;
578
579 do {
580 $doc_dir .= "/" if $doc_dir_num > 0;
581 if ($doc_dir_rest =~ s/^(.{1,8})//) {
582 $doc_dir .= $1;
583 $doc_dir_num++;
584 }
585 } while ($doc_dir_rest ne "" &&
586 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
587 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
588 my $i = 1;
589 my $doc_dir_base = $doc_dir;
590 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
591 $doc_dir = "$doc_dir_base-$i";
592 $i++;
593 }
594
595 return "$doc_dir.dir";
596}
597
598sub process_assoc_files {
599 my $self = shift (@_);
600 my ($doc_obj, $doc_dir, $handle) = @_;
601
602 my $outhandle = $self->{'output_handle'};
603
604 my $output_dir = $self->get_output_dir();
605 return if (!defined $output_dir);
606
607 &util::mk_all_dir ($output_dir) unless -e $output_dir;
608
609 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
610 &util::mk_all_dir ($working_dir) unless -e $working_dir;
611
612 my @assoc_files = ();
613 my $filename;;
614
615 my $source_filename = $doc_obj->get_source_filename();
616
617 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
618
619 if (defined $collect_dir) {
620 my $dirsep_regexp = &util::get_os_dirsep();
621
622 if ($collect_dir !~ /$dirsep_regexp$/) {
623 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
624 }
625
626 # This test is never going to fail on Windows -- is this a problem?
627
628 if ($source_filename !~ /^$dirsep_regexp/) {
629 $source_filename = &util::filename_cat($collect_dir, $source_filename);
630 }
631 }
632
633
634 # set the assocfile path (even if we have no assoc files - need this for lucene)
635 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
636 "assocfilepath",
637 "$doc_dir");
638 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
639 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
640 $dir = "" unless defined $dir;
641
642
643 my $real_filename = $assoc_file_rec->[0];
644 # for some reasons the image associate file has / before the full path
645 $real_filename =~ s/^\\(.*)/$1/i;
646 if (-e $real_filename) {
647
648 $filename = &util::filename_cat($working_dir, $afile);
649
650 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
651
652 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
653 "gsdlassocfile",
654 "$afile:$assoc_file_rec->[2]:$dir");
655 } elsif ($self->{'verbosity'} > 2) {
656 print $outhandle "BasePlugout::process couldn't copy the associated file " .
657 "$real_filename to $afile\n";
658 }
659 }
660}
661
662
663sub process_metafiles_metadata
664{
665 my $self = shift (@_);
666 my ($doc_obj) = @_;
667
668 my $top_section = $doc_obj->get_top_section();
669 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
670
671 foreach my $metafile_pair (@$metafiles) {
672 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
673
674 $doc_obj->metadata_file($full_metafile,$metafile);
675 }
676
677 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
678}
679
680sub archiveinf_files_to_field
681{
682 my $self = shift(@_);
683 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
684
685 foreach my $file_rec (@$files) {
686 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
687 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
688 # for some reasons the image associate file has / before the full path
689 $real_filename =~ s/^\\(.*)/$1/i;
690 if (-e $real_filename) {
691
692# if (defined $collect_dir) {
693# my $collect_dir_re_safe = $collect_dir;
694# $collect_dir_re_safe =~ s/\\/\\\\/g;
695# $collect_dir_re_safe =~ s/\./\\./g;##
696
697# $real_filename =~ s/^$collect_dir_re_safe//;
698# }
699
700 if (defined $reverse_lookups) {
701 $reverse_lookups->{$real_filename} = 1;
702 }
703 push(@{$oid_files->{$field}},$full_file);
704 }
705 else {
706 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
707 }
708 }
709}
710
711sub archiveinf_db
712{
713 my $self = shift (@_);
714 my ($doc_obj) = @_;
715
716 my $verbosity = $self->{'verbosity'};
717
718 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
719 if (defined $collect_dir) {
720 my $dirsep_regexp = &util::get_os_dirsep();
721
722 if ($collect_dir !~ /$dirsep_regexp$/) {
723 # ensure there is a slash at the end
724 $collect_dir .= &util::get_dirsep();
725 }
726 }
727
728 my $oid = $doc_obj->get_OID();
729 my $source_filename = $doc_obj->get_unmodified_source_filename();
730 my $working_info = $self->{'output_info'};
731 my $doc_info = $working_info->get_info($oid);
732
733 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
734 # doc_file is the path to the archive doc.xml. Make sure it has unix
735 # slashes, then if the collection is copied to linux, it can be built without reimport
736 $doc_file =~ s/\\/\//g;
737 my $oid_files = { 'doc-file' => $doc_file,
738 'index-status' => $index_status,
739 'src-file' => $source_filename,
740 'sort-meta' => $sortmeta,
741 'assoc-file' => [],
742 'meta-file' => [] };
743
744 my $reverse_lookups = { $source_filename => "1" };
745
746
747 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
748 $collect_dir,$oid_files,$reverse_lookups);
749
750
751 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
752 $collect_dir,$oid_files);
753
754 # Get the infodbtype value for this collection from the arcinfo object
755 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
756 my $output_dir = $self->{'output_dir'};
757
758 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
759 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
760
761 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
762
763 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
764 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
765 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
766 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
767
768 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
769 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
770 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
771
772 foreach my $rl (keys %$reverse_lookups) {
773 $working_info->add_reverseinfo($rl,$oid);
774 }
775
776 # meta files not set in reverese entry, but need to set the metadata flag
777 if (defined $doc_obj->get_meta_files()) {
778 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
779 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
780 $working_info->set_meta_file_flag($full_file);
781 }
782 }
783}
784
785
786sub set_sortmeta {
787 my $self = shift (@_);
788 my ($sortmeta, $removeprefix, $removesuffix) = @_;
789
790 $self->{'sortmeta'} = $sortmeta;
791 if (defined ($removeprefix) && $removeprefix ) {
792 $removeprefix =~ s/^\^//; # don't need a leading ^
793 $self->{'removeprefix'} = $removeprefix;
794 }
795 if (defined ($removesuffix) && $removesuffix) {
796 $removesuffix =~ s/\$$//; # don't need a trailing $
797 $self->{'removesuffix'} = $removesuffix;
798 }
799}
800
801sub open_xslt_pipe
802{
803 my $self = shift @_;
804 my ($output_file_name, $xslt_file)=@_;
805
806 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
807
808 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
809
810 my $mapping_file_path = "";
811
812 if ($ENV{'GSDLOS'} eq "windows"){
813 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
814 # this file:/// bit didn't work for me on windows XP
815 #$xslt_file = "\"file:///".$xslt_file."\"";
816 #$mapping_file_path = "\"file:///";
817 }
818 else{
819 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
820 }
821
822
823 $java_class_path = "\"".$java_class_path."\"";
824
825 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
826
827 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
828 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
829 $cmd .= "-m $mapping_file_path";
830 }
831
832 open(*XMLWRITER, $cmd)
833 or die "can't open pipe to xslt: $!";
834
835
836 $self->{'xslt_writer'} = *XMLWRITER;
837
838 print XMLWRITER "<?DocStart?>\n";
839 print XMLWRITER "$output_file_name\n";
840
841
842 }
843
844
845sub close_xslt_pipe
846{
847 my $self = shift @_;
848
849
850 return unless defined $self->{'xslt_writer'} ;
851
852 my $xsltwriter = $self->{'xslt_writer'};
853
854 print $xsltwriter "<?DocEnd?>\n";
855 close($xsltwriter);
856
857 undef $self->{'xslt_writer'};
858
859}
860
861sub close_file_output
862{
863 my ($self) = @_;
864
865 # make sure that the handle has been opened - it won't be if we failed
866 # to import any documents...
867 if (defined(fileno(GROUPPROCESS))) {
868 $self->output_xml_footer('GROUPPROCESS','Archive');
869 close GROUPPROCESS;
870 }
871
872 my $OID = $self->{'gs_OID'};
873 my $short_doc_file = $self->{'short_doc_file'};
874
875 if ($self->{'gzip'}) {
876 my $doc_file = $self->{'gs_filename'};
877 `gzip $doc_file`;
878 $doc_file .= ".gz";
879 $short_doc_file .= ".gz";
880 if (!-e $doc_file) {
881 my $outhandle = $self->{'output_handle'};
882 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
883 return 0;
884 }
885 }
886
887 # store reference in output_info
888 my $output_info = $self->{'output_info'};
889 return 0 if (!defined $output_info);
890 $output_info->add_info($OID, $short_doc_file, undef, undef);
891 return 1;
892}
893
894
895#the subclass should implement this method if is_group method could return 1.
896sub close_group_output{
897 my $self = shift (@_);
898}
899
900sub is_group {
901 my $self = shift (@_);
902 return 0;
903}
904
905my $dc_set = { Title => 1,
906 Creator => 1,
907 Subject => 1,
908 Description => 1,
909 Publisher => 1,
910 Contributor => 1,
911 Date => 1,
912 Type => 1,
913 Format => 1,
914 Identifier => 1,
915 Source => 1,
916 Language => 1,
917 Relation => 1,
918 Coverage => 1,
919 Rights => 1};
920
921
922# returns an XML representation of the dublin core metadata
923# if dc meta is not found, try ex mete
924sub get_dc_metadata {
925 my $self = shift(@_);
926 my ($doc_obj, $section, $version) = @_;
927
928 # build up string of dublin core metadata
929 $section="" unless defined $section;
930
931 my $section_ptr = $doc_obj->_lookup_section($section);
932 return "" unless defined $section_ptr;
933
934
935 my $explicit_dc = {};
936 my $explicit_ex = {};
937
938 my $all_text="";
939 foreach my $data (@{$section_ptr->{'metadata'}}){
940 my $escaped_value = &docprint::escape_text($data->[1]);
941 if ($data->[0]=~ m/^dc\./) {
942 $data->[0] =~ tr/[A-Z]/[a-z]/;
943
944 $data->[0] =~ m/^dc\.(.*)/;
945 my $dc_element = $1;
946
947 if (!defined $explicit_dc->{$dc_element}) {
948 $explicit_dc->{$dc_element} = [];
949 }
950 push(@{$explicit_dc->{$dc_element}},$escaped_value);
951
952 if (defined $version && ($version eq "oai_dc")) {
953 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
954 }
955 else {
956 # qualifier???
957 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
958 }
959
960 }
961 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
962 $data->[0] =~ m/^(ex\.)?(.*)/;
963 my $ex_element = $2;
964 my $lc_ex_element = lc($ex_element);
965
966 if (defined $dc_set->{$ex_element}) {
967 if (!defined $explicit_ex->{$lc_ex_element}) {
968 $explicit_ex->{$lc_ex_element} = [];
969 }
970 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
971 }
972 }
973 }
974
975 # go through dc_set and for any element *not* defined in explicit_dc
976 # that does exist in explicit_ex, add it in as metadata
977 foreach my $k ( keys %$dc_set ) {
978 my $lc_k = lc($k);
979
980 if (!defined $explicit_dc->{$lc_k}) {
981 if (defined $explicit_ex->{$lc_k}) {
982
983 foreach my $v (@{$explicit_ex->{$lc_k}}) {
984 my $dc_element = $lc_k;
985 my $escaped_value = $v;
986
987 if (defined $version && ($version eq "oai_dc")) {
988 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
989 }
990 else {
991 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
992 }
993
994 }
995 }
996 }
997 }
998
999 if ($all_text eq "") {
1000 $all_text .= " There is no Dublin Core metatdata in this document\n";
1001 }
1002 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1003
1004 return $all_text;
1005}
1006
1007# Build up dublin_core metadata. Priority given to dc.* over ex.*
1008# This method was apparently added by Jeffrey and committed by Shaoqun.
1009# But we don't know why it was added, so not using it anymore.
1010sub new_get_dc_metadata {
1011
1012 my $self = shift(@_);
1013 my ($doc_obj, $section, $version) = @_;
1014
1015 # build up string of dublin core metadata
1016 $section="" unless defined $section;
1017
1018 my $section_ptr=$doc_obj->_lookup_section($section);
1019 return "" unless defined $section_ptr;
1020
1021 my $all_text = "";
1022 foreach my $data (@{$section_ptr->{'metadata'}}){
1023 my $escaped_value = &docprint::escape_text($data->[1]);
1024 my $dc_element = $data->[0];
1025
1026 my @array = split('\.',$dc_element);
1027 my ($type,$name);
1028
1029 if(defined $array[1])
1030 {
1031 $type = $array[0];
1032 $name = $array[1];
1033 }
1034 else
1035 {
1036 $type = "ex";
1037 $name = $array[0];
1038 }
1039
1040 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1041 }
1042 return $all_text;
1043}
1044
1045
10461;
Note: See TracBrowser for help on using the repository browser.