source: gsdl/trunk/perllib/plugouts/BasePlugout.pm@ 18528

Last change on this file since 18528 was 18528, checked in by davidb, 15 years ago

OIDmetadata wasn't supported in collect.cfg, but OIDtype was. Now rectified. Also introduced OIDcount as a file saved in the archives folder to help doc.pm use the correct value when working incrementally

  • Property svn:keywords set to Author Date Id Revision
File size: 26.3 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use gsprintf 'gsprintf';
35use printusage;
36use parse2;
37use GDBMUtils;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'hiddengli' => "no"},
61 { 'name' => "output_handle",
62 'desc' => "{BasPlugout.output_handle}",
63 'type' => "string",
64 'deft' => 'STDERR',
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "verbosity",
68 'desc' => "{BasPlugout.verbosity}",
69 'type' => "int",
70 'deft' => "0",
71 'reqd' => "no",
72 'hiddengli' => "no"},
73 { 'name' => "gzip_output",
74 'desc' => "{BasPlugout.gzip_output}",
75 'type' => "flag",
76 'reqd' => "no",
77 'hiddengli' => "no"},
78 { 'name' => "debug",
79 'desc' => "{BasPlugout.debug}",
80 'type' => "flag",
81 'reqd' => "no",
82 'hiddengli' => "yes"}
83];
84
85my $options = { 'name' => "BasePlugout",
86 'desc' => "{BasPlugout.desc}",
87 'abstract' => "yes",
88 'inherits' => "no",
89 'args' => $arguments};
90
91sub new
92{
93 my $class = shift (@_);
94
95 my ($plugoutlist,$args,$hashArgOptLists) = @_;
96 push(@$plugoutlist, $class);
97
98 my $strPlugoutName = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
99
100 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
101 push(@{$hashArgOptLists->{"OptList"}},$options);
102
103 my $self = {};
104 $self->{'plugout_type'} = $class;
105 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
106 $self->{"info_only"} = 0;
107
108 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
109 # the args, just return the object.
110 foreach my $strArg (@{$args})
111 {
112 if(defined $strArg && $strArg eq "-gsdlinfo")
113 {
114 $self->{"info_only"} = 1;
115 return bless $self, $class;
116 }
117 }
118
119 delete $self->{"info_only"};
120
121 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
122 {
123 my $classTempClass = bless $self, $class;
124 print STDERR "<BadPlugout d=$self->{'plugout_name'}>\n";
125 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $self->{'plugout_name'});
126 $classTempClass->print_txt_usage(""); # Use default resource bundle
127 die "\n";
128 }
129
130
131 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
132 {
133 ##$self->{'xslt_file'} =~ s/\"//g;##working on Windows???
134 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n" and die "\n" unless (-e $self->{'xslt_file'});
135 }
136
137 $self->{'gs_count'} = 0;
138
139 $self->{'keep_import_structure'} = 0;
140
141 return bless $self, $class;
142
143}
144
145sub print_xml_usage
146{
147 my $self = shift(@_);
148 my $header = shift(@_);
149 my $high_level_information_only = shift(@_);
150
151 # XML output is always in UTF-8
152 gsprintf::output_strings_in_UTF8;
153
154 if ($header) {
155 &PrintUsage::print_xml_header("plugout");
156 }
157 $self->print_xml($high_level_information_only);
158}
159
160
161sub print_xml
162{
163 my $self = shift(@_);
164 my $high_level_information_only = shift(@_);
165
166 my $optionlistref = $self->{'option_list'};
167 my @optionlist = @$optionlistref;
168 my $plugoutoptions = shift(@$optionlistref);
169 return if (!defined($plugoutoptions));
170
171 gsprintf(STDERR, "<PlugoutInfo>\n");
172 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
173 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
174 $desc =~ s/</&amp;lt;/g; # doubly escaped
175 $desc =~ s/>/&amp;gt;/g;
176 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
177 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
178 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
179 unless (defined($high_level_information_only)) {
180 gsprintf(STDERR, " <Arguments>\n");
181 if (defined($plugoutoptions->{'args'})) {
182 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
183 }
184 gsprintf(STDERR, " </Arguments>\n");
185
186 # Recurse up the plugout hierarchy
187 $self->print_xml();
188 }
189 gsprintf(STDERR, "</PlugoutInfo>\n");
190}
191
192
193sub print_txt_usage
194{
195 my $self = shift(@_);
196
197 # Print the usage message for a plugout (recursively)
198 my $descoffset = $self->determine_description_offset(0);
199 $self->print_plugout_usage($descoffset, 1);
200}
201
202sub determine_description_offset
203{
204 my $self = shift(@_);
205 my $maxoffset = shift(@_);
206
207 my $optionlistref = $self->{'option_list'};
208 my @optionlist = @$optionlistref;
209 my $plugoutoptions = pop(@$optionlistref);
210 return $maxoffset if (!defined($plugoutoptions));
211
212 # Find the length of the longest option string of this download
213 my $plugoutargs = $plugoutoptions->{'args'};
214 if (defined($plugoutargs)) {
215 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
216 if ($longest > $maxoffset) {
217 $maxoffset = $longest;
218 }
219 }
220
221 # Recurse up the download hierarchy
222 $maxoffset = $self->determine_description_offset($maxoffset);
223 $self->{'option_list'} = \@optionlist;
224 return $maxoffset;
225}
226
227
228sub print_plugout_usage
229{
230 my $self = shift(@_);
231 my $descoffset = shift(@_);
232 my $isleafclass = shift(@_);
233
234 my $optionlistref = $self->{'option_list'};
235 my @optionlist = @$optionlistref;
236 my $plugoutoptions = shift(@$optionlistref);
237 return if (!defined($plugoutoptions));
238
239 my $plugoutname = $plugoutoptions->{'name'};
240 my $plugoutargs = $plugoutoptions->{'args'};
241 my $plugoutdesc = $plugoutoptions->{'desc'};
242
243 # Produce the usage information using the data structure above
244 if ($isleafclass) {
245 if (defined($plugoutdesc)) {
246 gsprintf(STDERR, "$plugoutdesc\n\n");
247 }
248 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
249 }
250
251 # Display the download options, if there are some
252 if (defined($plugoutargs)) {
253 # Calculate the column offset of the option descriptions
254 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
255
256 if ($isleafclass) {
257 gsprintf(STDERR, " {common.specific_options}:\n");
258 }
259 else {
260 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
261 }
262
263 # Display the download options
264 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
265 }
266
267 # Recurse up the download hierarchy
268 $self->print_plugout_usage($descoffset, 0);
269 $self->{'option_list'} = \@optionlist;
270}
271
272
273sub error
274{
275 my ($strFunctionName,$strError) = @_;
276 {
277 print "Error occoured in BasePlugout.pm\n".
278 "In Function: ".$strFunctionName."\n".
279 "Error Message: ".$strError."\n";
280 exit(-1);
281 }
282}
283
284# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
285sub set_OIDtype {
286 my $self = shift (@_);
287 my ($type, $metadata) = @_;
288
289 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
290 $self->{'OIDtype'} = $type;
291 } else {
292 $self->{'OIDtype'} = "hash";
293 }
294 if ($type =~ /^assigned$/) {
295 if (defined $metadata) {
296 $self->{'OIDmetadata'} = $metadata;
297 } else {
298 $self->{'OIDmetadata'} = "dc.Identifier";
299 }
300 }
301}
302
303sub set_output_dir
304{
305 my $self = shift @_;
306 my ($output_dir) = @_;
307
308 $self->{'output_dir'} = $output_dir;
309}
310
311sub setoutputdir
312{
313 my $self = shift @_;
314 my ($output_dir) = @_;
315
316 $self->{'output_dir'} = $output_dir;
317}
318
319sub get_output_dir
320{
321 my $self = shift (@_);
322
323 return $self->{'output_dir'};
324}
325
326sub getoutputdir
327{
328 my $self = shift (@_);
329
330 return $self->{'output_dir'};
331}
332
333sub getoutputinfo
334{
335 my $self = shift (@_);
336
337 return $self->{'output_info'};
338}
339
340
341sub get_output_handler
342{
343 my $self = shift (@_);
344
345 my ($output_file_name) = @_;
346
347 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
348
349 return *OUTPUT;
350}
351
352sub release_output_handler
353{
354 my $self = shift (@_);
355 my ($outhandler) = @_;
356
357 close($outhandler);
358
359}
360
361sub output_xml_header {
362 my $self = shift (@_);
363 my ($handle,$docroot,$nondoctype) = @_;
364
365 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
366
367 if (!defined $nondoctype){
368 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
369 }
370
371 print $handle "<$docroot>\n" if defined $docroot;
372}
373
374sub output_xml_footer {
375 my $self = shift (@_);
376 my ($handle,$docroot) = @_;
377 print $handle "</$docroot>\n" if defined $docroot;
378}
379
380sub process {
381 my $self = shift (@_);
382 my ($doc_obj) = @_;
383
384 $doc_obj->set_lastmodified();
385
386 if ($self->{'group_size'} > 1) {
387 $self->group_process ($doc_obj);
388 return;
389 }
390
391 my $OID = $doc_obj->get_OID();
392 $OID = "NULL" unless defined $OID;
393
394 my $top_section = $doc_obj->get_top_section();
395
396 #get document's directory
397 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
398
399 my $output_info = $self->{'output_info'};
400 return if (!defined $output_info);
401
402 ##############################
403 # call subclass' saveas method
404 ##############################
405 $self->saveas($doc_obj,$doc_dir);
406 $self->archiveinf_gdbm($doc_obj,$doc_dir);
407
408}
409
410sub store_output_info_reference {
411 my $self = shift (@_);
412 my ($doc_obj) = @_;
413
414 my $output_info = $self->{'output_info'};
415 my $metaname = $self->{'sortmeta'};
416 if (!defined $metaname || $metaname !~ /\S/) {
417 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
418 return;
419 }
420
421 my $metadata = "";
422 my $top_section = $doc_obj->get_top_section();
423
424 my @commameta_list = split(/,/, $metaname);
425 foreach my $cmn (@commameta_list) {
426 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
427 if ($meta) {
428 # do remove prefix/suffix - this will apply to all values
429 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
430 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
431 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
432 $metadata .= $meta if ($meta);
433 }
434 }
435
436 # store reference in the output_info
437 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
438
439}
440
441sub group_process {
442
443 my $self = shift (@_);
444 my ($doc_obj) = @_;
445
446 my $OID = $doc_obj->get_OID();
447 $OID = "NULL" unless defined $OID;
448
449 my $groupsize = $self->{'group_size'};
450 my $gs_count = $self->{'gs_count'};
451 my $open_new_file = (($gs_count % $groupsize)==0);
452 my $outhandle = $self->{'output_handle'};
453
454 # opening a new file, or document has assoicated files => directory needed
455 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
456
457 # The directory the archive file (doc.xml) and all associated files
458 # should end up in
459 my $doc_dir;
460 # If we've determined its time for a new file, open it now
461 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
462 {
463 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
464 # only if opening new file
465 my $output_dir = $self->get_output_dir();
466 &util::mk_all_dir ($output_dir) unless -e $output_dir;
467 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
468 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
469
470 if ($gs_count>0)
471 {
472 return if (!$self->close_file_output());
473 }
474
475 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
476
477
478 $self->{'gs_filename'} = $doc_file;
479 $self->{'short_doc_file'} = $short_doc_file;
480 $self->{'gs_OID'} = $OID;
481 $self->{'gs_doc_dir'} = $doc_dir;
482
483 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
484 }
485 # Otherwise load the same archive document directory used last time
486 else
487 {
488 $doc_dir = $self->{'gs_doc_dir'};
489 }
490
491 # copy all the associated files, add this information as metadata
492 # to the document
493 print $outhandle "Writing associated files to $doc_dir\n";
494 $self->process_assoc_files ($doc_obj, $doc_dir);
495 }
496
497 # save this document
498 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
499 print GROUPPROCESS $section_text;
500
501 $self->{'gs_count'}++;
502}
503
504
505sub saveas {
506 my $self = shift (@_);
507
508 die "Basplug::saveas function must be implemented in sub classes\n";
509}
510
511sub get_doc_dir {
512 my $self = shift (@_);
513 my ($OID, $source_filename) = @_;
514
515 my $working_dir = $self->get_output_dir();
516 my $working_info = $self->{output_info};
517 return if (!defined $working_info);
518
519 my $doc_info = $working_info->get_info($OID);
520 my $doc_dir = '';
521
522 if (defined $doc_info && scalar(@$doc_info) >= 1)
523 {
524 # This OID already has an archives directory, so use it again
525 $doc_dir = $doc_info->[0];
526 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
527 }
528 elsif ($self->{'keep_import_structure'})
529 {
530 $source_filename = &File::Basename::dirname($source_filename);
531 $source_filename =~ s/[\\\/]+/\//g;
532 $source_filename =~ s/\/$//;
533
534 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
535 }
536
537 # We have to use a new archives directory for this document
538 if ($doc_dir eq "")
539 {
540 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
541 }
542
543 if (!defined $self->{'group'} || !$self->{'group'}){
544 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
545 }
546
547 return $doc_dir;
548}
549
550sub get_new_doc_dir{
551 my $self = shift (@_);
552 my($working_info,$working_dir,$OID) = @_;
553
554 my $doc_dir = "";
555 my $doc_dir_rest = $OID;
556 my $doc_dir_num = 0;
557
558 do {
559 $doc_dir .= "/" if $doc_dir_num > 0;
560 if ($doc_dir_rest =~ s/^(.{1,8})//) {
561 $doc_dir .= $1;
562 $doc_dir_num++;
563 }
564 } while ($doc_dir_rest ne "" &&
565 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
566 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
567 my $i = 1;
568 my $doc_dir_base = $doc_dir;
569 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
570 $doc_dir = "$doc_dir_base-$i";
571 $i++;
572 }
573
574 return "$doc_dir.dir";
575}
576
577sub process_assoc_files {
578 my $self = shift (@_);
579 my ($doc_obj, $doc_dir, $handle) = @_;
580
581 my $outhandle = $self->{'output_handle'};
582
583 my $output_dir = $self->get_output_dir();
584 return if (!defined $output_dir);
585
586 &util::mk_all_dir ($output_dir) unless -e $output_dir;
587
588 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
589 &util::mk_all_dir ($working_dir) unless -e $working_dir;
590
591 my @assoc_files = ();
592 my $filename;;
593
594 my $source_filename = $doc_obj->get_source_filename();
595
596 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
597
598 if (defined $collect_dir) {
599 my $dirsep_regexp = &util::get_os_dirsep();
600
601 if ($collect_dir !~ /$dirsep_regexp$/) {
602 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
603 }
604
605 # This test is never going to fail on Windows -- is this a problem?
606
607 if ($source_filename !~ /^$dirsep_regexp/) {
608 $source_filename = &util::filename_cat($collect_dir, $source_filename);
609 }
610 }
611
612
613 # set the assocfile path (even if we have no assoc files - need this for lucene)
614 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
615 "assocfilepath",
616 "$doc_dir");
617 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
618 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
619 $dir = "" unless defined $dir;
620
621
622 my $real_filename = $assoc_file_rec->[0];
623 # for some reasons the image associate file has / before the full path
624 $real_filename =~ s/^\\(.*)/$1/i;
625 if (-e $real_filename) {
626
627 $filename = &util::filename_cat($working_dir, $afile);
628
629 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
630
631 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
632 "gsdlassocfile",
633 "$afile:$assoc_file_rec->[2]:$dir");
634 } elsif ($self->{'verbosity'} > 2) {
635 print $outhandle "BasePlugout::process couldn't copy the associated file " .
636 "$real_filename to $afile\n";
637 }
638 }
639}
640
641
642sub archiveinf_gdbm
643{
644 my $self = shift (@_);
645 my ($doc_obj) = @_;
646
647 my $verbosity = $self->{'verbosity'};
648
649 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
650 if (defined $collect_dir) {
651 my $dirsep_regexp = &util::get_os_dirsep();
652
653 if ($collect_dir !~ /$dirsep_regexp$/) {
654 # ensure there is a slash at the end
655 $collect_dir .= &util::get_dirsep();
656 }
657 }
658
659 my $oid = $doc_obj->get_OID();
660 my $source_filename = $doc_obj->get_source_filename();
661
662 my $working_info = $self->{'output_info'};
663 my $doc_info = $working_info->get_info($oid);
664 my ($doc_file,$index_status) = @$doc_info;
665
666 my $oid_files = { 'doc-file' => $doc_file,
667 'index-status' => $index_status,
668 'src-file' => $source_filename,
669 'assoc-files' => [] };
670
671 my %reverse_lookups = ( $source_filename => "1" );
672
673 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
674 my $real_filename = $assoc_file_rec->[0];
675 my $full_afile = $assoc_file_rec->[1];
676
677 # for some reasons the image associate file has / before the full path
678 $real_filename =~ s/^\\(.*)/$1/i;
679 if (-e $real_filename) {
680
681 if (defined $collect_dir) {
682 my $collect_dir_re_safe = $collect_dir;
683 $collect_dir_re_safe =~ s/\\/\\\\/g;
684 $collect_dir_re_safe =~ s/\./\\./g;
685
686 $real_filename =~ s/^$collect_dir_re_safe//;
687 }
688
689 $reverse_lookups{$real_filename} = 1;
690
691 push(@{$oid_files->{'assoc-files'}},$full_afile);
692
693 }
694 else {
695 print STDERR "Warning: archiveinf_gdbm()\n $real_filename does not appear to be on the file system\n";
696 }
697 }
698
699 # better not to commit to a particular db implementation, but
700 # for simplicity, will use GDBM for now.
701
702 my $output_dir = $self->{'output_dir'};
703 my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
704
705 my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc$db_ext");
706 my $src_db = &util::filename_cat($output_dir,"archiveinf-src$db_ext");
707
708 my $doc_db_text = "";
709 $doc_db_text .= "<doc-file>$oid_files->{'doc-file'}\n";
710 $doc_db_text .= "<index-status>$oid_files->{'index-status'}\n";
711 $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n";
712 foreach my $af (@{$oid_files->{'assoc-files'}}) {
713 $doc_db_text .= "<assoc-file>$af\n";
714 }
715 chomp($doc_db_text); # remove trailing \n
716
717 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
718 &GDBMUtils::gdbmDatabaseSet($doc_db,$oid,$doc_db_text);
719
720 foreach my $rl (keys %reverse_lookups) {
721 &GDBMUtils::gdbmDatabaseAppend($src_db,$rl,"<oid>$oid\n");
722 }
723
724}
725
726
727sub set_sortmeta {
728 my $self = shift (@_);
729 my ($sortmeta, $removeprefix, $removesuffix) = @_;
730
731 $self->{'sortmeta'} = $sortmeta;
732 if (defined ($removeprefix) && $removeprefix ) {
733 $removeprefix =~ s/^\^//; # don't need a leading ^
734 $self->{'removeprefix'} = $removeprefix;
735 }
736 if (defined ($removesuffix) && $removesuffix) {
737 $removesuffix =~ s/\$$//; # don't need a trailing $
738 $self->{'removesuffix'} = $removesuffix;
739 }
740}
741
742sub open_xslt_pipe
743{
744 my $self = shift @_;
745 my ($output_file_name, $xslt_file)=@_;
746
747 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
748
749 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
750
751 my $mapping_file_path = "";
752
753 if ($ENV{'GSDLOS'} eq "windows"){
754 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
755 $xslt_file = "\"file:///".$xslt_file."\"";
756 $mapping_file_path = "\"file:///";
757 }
758 else{
759 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
760 }
761
762
763 $java_class_path = "\"".$java_class_path."\"";
764
765 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t $xslt_file ";
766
767 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
768 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
769 $cmd .= "-m $mapping_file_path";
770 }
771
772 open(*XMLWRITER, $cmd)
773 or die "can't open pipe to xslt: $!";
774
775
776 $self->{'xslt_writer'} = *XMLWRITER;
777
778 print XMLWRITER "<?DocStart?>\n";
779 print XMLWRITER "$output_file_name\n";
780
781
782 }
783
784
785sub close_xslt_pipe
786{
787 my $self = shift @_;
788
789
790 return unless defined $self->{'xslt_writer'} ;
791
792 my $xsltwriter = $self->{'xslt_writer'};
793
794 print $xsltwriter "<?DocEnd?>\n";
795 close($xsltwriter);
796
797 undef $self->{'xslt_writer'};
798
799}
800
801sub close_file_output
802{
803 my ($self) = @_;
804
805 # make sure that the handle has been opened - it won't be if we failed
806 # to import any documents...
807 if (defined(fileno(GROUPPROCESS))) {
808 $self->output_xml_footer('GROUPPROCESS','Archive');
809 close GROUPPROCESS;
810 }
811
812 my $OID = $self->{'gs_OID'};
813 my $short_doc_file = $self->{'short_doc_file'};
814
815 if ($self->{'gzip'}) {
816 my $doc_file = $self->{'gs_filename'};
817 `gzip $doc_file`;
818 $doc_file .= ".gz";
819 $short_doc_file .= ".gz";
820 if (!-e $doc_file) {
821 my $outhandle = $self->{'output_handle'};
822 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
823 return 0;
824 }
825 }
826
827 # store reference in output_info
828 my $output_info = $self->{'output_info'};
829 return 0 if (!defined $output_info);
830 $output_info->add_info($OID, $short_doc_file, undef, undef);
831 return 1;
832}
833
834
835#the subclass should implement this method if is_group method could return 1.
836sub close_group_output{
837 my $self = shift (@_);
838}
839
840sub is_group {
841 my $self = shift (@_);
842 return 0;
843}
844
845my $dc_set = { Title => 1,
846 Creator => 1,
847 Subject => 1,
848 Description => 1,
849 Publisher => 1,
850 Contributor => 1,
851 Date => 1,
852 Type => 1,
853 Format => 1,
854 Identifier => 1,
855 Source => 1,
856 Language => 1,
857 Relation => 1,
858 Coverage => 1,
859 Rights => 1};
860
861
862# returns an XML representation of the dublin core metadata
863# if dc meta is not found, try ex mete
864sub get_dc_metadata {
865 my $self = shift(@_);
866 my ($doc_obj, $section, $version) = @_;
867
868 # build up string of dublin core metadata
869 $section="" unless defined $section;
870
871 my $section_ptr = $doc_obj->_lookup_section($section);
872 return "" unless defined $section_ptr;
873
874
875 my $explicit_dc = {};
876 my $explicit_ex = {};
877
878 my $all_text="";
879 foreach my $data (@{$section_ptr->{'metadata'}}){
880 my $escaped_value = &docprint::escape_text($data->[1]);
881 if ($data->[0]=~ m/^dc\./) {
882 $data->[0] =~ tr/[A-Z]/[a-z]/;
883
884 $data->[0] =~ m/^dc\.(.*)/;
885 my $dc_element = $1;
886
887 if (!defined $explicit_dc->{$dc_element}) {
888 $explicit_dc->{$dc_element} = [];
889 }
890 push(@{$explicit_dc->{$dc_element}},$escaped_value);
891
892 if (defined $version && ($version eq "oai_dc")) {
893 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
894 }
895 else {
896 # qualifier???
897 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
898 }
899
900 }
901 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
902 $data->[0] =~ m/^(ex\.)?(.*)/;
903 my $ex_element = $2;
904 my $lc_ex_element = lc($ex_element);
905
906 if (defined $dc_set->{$ex_element}) {
907 if (!defined $explicit_ex->{$lc_ex_element}) {
908 $explicit_ex->{$lc_ex_element} = [];
909 }
910 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
911 }
912 }
913 }
914
915 # go through dc_set and for any element *not* defined in explicit_dc
916 # that does exist in explicit_ex, add it in as metadata
917 foreach my $k ( keys %$dc_set ) {
918 my $lc_k = lc($k);
919
920 if (!defined $explicit_dc->{$lc_k}) {
921 if (defined $explicit_ex->{$lc_k}) {
922
923 foreach my $v (@{$explicit_ex->{$lc_k}}) {
924 my $dc_element = $lc_k;
925 my $escaped_value = $v;
926
927 if (defined $version && ($version eq "oai_dc")) {
928 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
929 }
930 else {
931 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
932 }
933
934 }
935 }
936 }
937 }
938
939 if ($all_text eq "") {
940 $all_text .= " There is no Dublin Core metatdata in this document\n";
941 }
942 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
943
944 return $all_text;
945}
946
947# Build up dublin_core metadata. Priority given to dc.* over ex.*
948# This method was apparently added by Jeffrey and committed by Shaoqun.
949# But we don't know why it was added, so not using it anymore.
950sub new_get_dc_metadata {
951
952 my $self = shift(@_);
953 my ($doc_obj, $section, $version) = @_;
954
955 # build up string of dublin core metadata
956 $section="" unless defined $section;
957
958 my $section_ptr=$doc_obj->_lookup_section($section);
959 return "" unless defined $section_ptr;
960
961 my $all_text = "";
962 foreach my $data (@{$section_ptr->{'metadata'}}){
963 my $escaped_value = &docprint::escape_text($data->[1]);
964 my $dc_element = $data->[0];
965
966 my @array = split('\.',$dc_element);
967 my ($type,$name);
968
969 if(defined $array[1])
970 {
971 $type = $array[0];
972 $name = $array[1];
973 }
974 else
975 {
976 $type = "ex";
977 $name = $array[0];
978 }
979
980 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
981 }
982 return $all_text;
983}
984
985
9861;
Note: See TracBrowser for help on using the repository browser.