source: gsdl/trunk/perllib/plugouts/BasePlugout.pm@ 19829

Last change on this file since 19829 was 19829, checked in by davidb, 15 years ago

doc.pm API extended to include call for finding out the original source filename (rather than the one where the rename_method has been applied). Useful for incremental building, and probably other things too

  • Property svn:keywords set to Author Date Id Revision
File size: 28.6 KB
RevLine 
[12330]1###########################################################################
2#
[17202]3# BasePlugout.pm -- base class for all the plugout modules
[12330]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[17202]26package BasePlugout;
[12330]27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
[12459]32no strict 'refs';
[12330]33
34use gsprintf 'gsprintf';
35use printusage;
[12546]36use parse2;
[17087]37use GDBMUtils;
[12330]38
[17087]39
[12330]40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
[13172]45 { 'name' => "group_size",
[12330]46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'hiddengli' => "no"},
61 { 'name' => "output_handle",
62 'desc' => "{BasPlugout.output_handle}",
63 'type' => "string",
64 'deft' => 'STDERR',
65 'reqd' => "no",
66 'hiddengli' => "yes"},
67 { 'name' => "verbosity",
68 'desc' => "{BasPlugout.verbosity}",
69 'type' => "int",
70 'deft' => "0",
71 'reqd' => "no",
72 'hiddengli' => "no"},
73 { 'name' => "gzip_output",
74 'desc' => "{BasPlugout.gzip_output}",
75 'type' => "flag",
76 'reqd' => "no",
[13172]77 'hiddengli' => "no"},
78 { 'name' => "debug",
79 'desc' => "{BasPlugout.debug}",
80 'type' => "flag",
81 'reqd' => "no",
82 'hiddengli' => "yes"}
[12330]83];
84
[17202]85my $options = { 'name' => "BasePlugout",
[12330]86 'desc' => "{BasPlugout.desc}",
87 'abstract' => "yes",
88 'inherits' => "no",
89 'args' => $arguments};
90
91sub new
92{
93 my $class = shift (@_);
94
95 my ($plugoutlist,$args,$hashArgOptLists) = @_;
96 push(@$plugoutlist, $class);
97
98 my $strPlugoutName = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
99
[17202]100 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
101 push(@{$hashArgOptLists->{"OptList"}},$options);
[12330]102
103 my $self = {};
104 $self->{'plugout_type'} = $class;
105 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
106 $self->{"info_only"} = 0;
107
108 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
109 # the args, just return the object.
110 foreach my $strArg (@{$args})
111 {
112 if(defined $strArg && $strArg eq "-gsdlinfo")
113 {
114 $self->{"info_only"} = 1;
115 return bless $self, $class;
116 }
117 }
118
119 delete $self->{"info_only"};
120
[12546]121 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[12330]122 {
123 my $classTempClass = bless $self, $class;
124 print STDERR "<BadPlugout d=$self->{'plugout_name'}>\n";
125 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $self->{'plugout_name'});
126 $classTempClass->print_txt_usage(""); # Use default resource bundle
127 die "\n";
128 }
129
130
[12603]131 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
[12330]132 {
133 ##$self->{'xslt_file'} =~ s/\"//g;##working on Windows???
134 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n" and die "\n" unless (-e $self->{'xslt_file'});
135 }
136
137 $self->{'gs_count'} = 0;
138
[12459]139 $self->{'keep_import_structure'} = 0;
[12330]140
141 return bless $self, $class;
142
143}
144
145sub print_xml_usage
146{
147 my $self = shift(@_);
148 my $header = shift(@_);
[12628]149 my $high_level_information_only = shift(@_);
[12330]150
151 # XML output is always in UTF-8
152 gsprintf::output_strings_in_UTF8;
153
154 if ($header) {
155 &PrintUsage::print_xml_header("plugout");
156 }
[12628]157 $self->print_xml($high_level_information_only);
[12330]158}
159
160
161sub print_xml
162{
163 my $self = shift(@_);
[12628]164 my $high_level_information_only = shift(@_);
165
[12330]166 my $optionlistref = $self->{'option_list'};
167 my @optionlist = @$optionlistref;
168 my $plugoutoptions = shift(@$optionlistref);
169 return if (!defined($plugoutoptions));
170
171 gsprintf(STDERR, "<PlugoutInfo>\n");
172 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
173 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
174 $desc =~ s/</&amp;lt;/g; # doubly escaped
175 $desc =~ s/>/&amp;gt;/g;
176 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
177 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
178 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
[12628]179 unless (defined($high_level_information_only)) {
180 gsprintf(STDERR, " <Arguments>\n");
181 if (defined($plugoutoptions->{'args'})) {
182 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
183 }
184 gsprintf(STDERR, " </Arguments>\n");
[12330]185
[12628]186 # Recurse up the plugout hierarchy
187 $self->print_xml();
[12330]188 }
189 gsprintf(STDERR, "</PlugoutInfo>\n");
190}
191
192
193sub print_txt_usage
194{
195 my $self = shift(@_);
196
197 # Print the usage message for a plugout (recursively)
198 my $descoffset = $self->determine_description_offset(0);
199 $self->print_plugout_usage($descoffset, 1);
200}
201
202sub determine_description_offset
203{
204 my $self = shift(@_);
205 my $maxoffset = shift(@_);
206
207 my $optionlistref = $self->{'option_list'};
208 my @optionlist = @$optionlistref;
209 my $plugoutoptions = pop(@$optionlistref);
210 return $maxoffset if (!defined($plugoutoptions));
211
212 # Find the length of the longest option string of this download
213 my $plugoutargs = $plugoutoptions->{'args'};
214 if (defined($plugoutargs)) {
215 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
216 if ($longest > $maxoffset) {
217 $maxoffset = $longest;
218 }
219 }
220
221 # Recurse up the download hierarchy
222 $maxoffset = $self->determine_description_offset($maxoffset);
223 $self->{'option_list'} = \@optionlist;
224 return $maxoffset;
225}
226
227
228sub print_plugout_usage
229{
230 my $self = shift(@_);
231 my $descoffset = shift(@_);
232 my $isleafclass = shift(@_);
233
234 my $optionlistref = $self->{'option_list'};
235 my @optionlist = @$optionlistref;
236 my $plugoutoptions = shift(@$optionlistref);
237 return if (!defined($plugoutoptions));
238
239 my $plugoutname = $plugoutoptions->{'name'};
240 my $plugoutargs = $plugoutoptions->{'args'};
241 my $plugoutdesc = $plugoutoptions->{'desc'};
242
243 # Produce the usage information using the data structure above
244 if ($isleafclass) {
245 if (defined($plugoutdesc)) {
246 gsprintf(STDERR, "$plugoutdesc\n\n");
247 }
248 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
249 }
250
251 # Display the download options, if there are some
252 if (defined($plugoutargs)) {
253 # Calculate the column offset of the option descriptions
254 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
255
256 if ($isleafclass) {
257 gsprintf(STDERR, " {common.specific_options}:\n");
258 }
259 else {
260 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
261 }
262
263 # Display the download options
264 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
265 }
266
267 # Recurse up the download hierarchy
268 $self->print_plugout_usage($descoffset, 0);
269 $self->{'option_list'} = \@optionlist;
270}
271
272
273sub error
274{
275 my ($strFunctionName,$strError) = @_;
276 {
[17202]277 print "Error occoured in BasePlugout.pm\n".
[12330]278 "In Function: ".$strFunctionName."\n".
279 "Error Message: ".$strError."\n";
280 exit(-1);
281 }
282}
283
284# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
285sub set_OIDtype {
286 my $self = shift (@_);
[12618]287 my ($type, $metadata) = @_;
[12330]288
289 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
290 $self->{'OIDtype'} = $type;
291 } else {
292 $self->{'OIDtype'} = "hash";
293 }
[12618]294 if ($type =~ /^assigned$/) {
295 if (defined $metadata) {
296 $self->{'OIDmetadata'} = $metadata;
297 } else {
298 $self->{'OIDmetadata'} = "dc.Identifier";
299 }
300 }
[12330]301}
302
303sub set_output_dir
304{
305 my $self = shift @_;
306 my ($output_dir) = @_;
307
308 $self->{'output_dir'} = $output_dir;
309}
310
311sub setoutputdir
312{
313 my $self = shift @_;
314 my ($output_dir) = @_;
315
316 $self->{'output_dir'} = $output_dir;
317}
318
319sub get_output_dir
320{
321 my $self = shift (@_);
322
323 return $self->{'output_dir'};
324}
325
326sub getoutputdir
327{
328 my $self = shift (@_);
329
330 return $self->{'output_dir'};
331}
332
333sub getoutputinfo
334{
335 my $self = shift (@_);
336
337 return $self->{'output_info'};
338}
339
340
341sub get_output_handler
342{
343 my $self = shift (@_);
344
345 my ($output_file_name) = @_;
346
347 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
348
349 return *OUTPUT;
350}
351
352sub release_output_handler
353{
354 my $self = shift (@_);
355 my ($outhandler) = @_;
356
357 close($outhandler);
358
359}
360
361sub output_xml_header {
362 my $self = shift (@_);
363 my ($handle,$docroot,$nondoctype) = @_;
364
365 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
366
367 if (!defined $nondoctype){
368 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
369 }
370
371 print $handle "<$docroot>\n" if defined $docroot;
372}
373
374sub output_xml_footer {
375 my $self = shift (@_);
376 my ($handle,$docroot) = @_;
377 print $handle "</$docroot>\n" if defined $docroot;
378}
379
380sub process {
381 my $self = shift (@_);
382 my ($doc_obj) = @_;
[12459]383
[12330]384 $doc_obj->set_lastmodified();
385
386 if ($self->{'group_size'} > 1) {
[12459]387 $self->group_process ($doc_obj);
[12330]388 return;
389 }
390
391 my $OID = $doc_obj->get_OID();
392 $OID = "NULL" unless defined $OID;
393
394 my $top_section = $doc_obj->get_top_section();
395
396 #get document's directory
397 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
398
399 my $output_info = $self->{'output_info'};
400 return if (!defined $output_info);
401
402 ##############################
403 # call subclass' saveas method
404 ##############################
405 $self->saveas($doc_obj,$doc_dir);
[18441]406 $self->archiveinf_gdbm($doc_obj,$doc_dir);
[17087]407
[12330]408}
409
[12363]410sub store_output_info_reference {
[12330]411 my $self = shift (@_);
412 my ($doc_obj) = @_;
413
[12363]414 my $output_info = $self->{'output_info'};
415 my $metaname = $self->{'sortmeta'};
416 if (!defined $metaname || $metaname !~ /\S/) {
417 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
418 return;
419 }
[12330]420
[12363]421 my $metadata = "";
422 my $top_section = $doc_obj->get_top_section();
423
424 my @commameta_list = split(/,/, $metaname);
425 foreach my $cmn (@commameta_list) {
426 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
427 if ($meta) {
428 # do remove prefix/suffix - this will apply to all values
429 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
430 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
431 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
432 $metadata .= $meta if ($meta);
433 }
[12330]434 }
435
436 # store reference in the output_info
437 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
438
439}
440
441sub group_process {
442
443 my $self = shift (@_);
444 my ($doc_obj) = @_;
445
446 my $OID = $doc_obj->get_OID();
447 $OID = "NULL" unless defined $OID;
448
449 my $groupsize = $self->{'group_size'};
450 my $gs_count = $self->{'gs_count'};
451 my $open_new_file = (($gs_count % $groupsize)==0);
452 my $outhandle = $self->{'output_handle'};
453
454 # opening a new file, or document has assoicated files => directory needed
[12459]455 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
456
[12330]457 # The directory the archive file (doc.xml) and all associated files
458 # should end up in
459 my $doc_dir;
460 # If we've determined its time for a new file, open it now
461 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
462 {
463 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
464 # only if opening new file
465 my $output_dir = $self->get_output_dir();
[12459]466 &util::mk_all_dir ($output_dir) unless -e $output_dir;
[12330]467 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
468 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
[12459]469
[12330]470 if ($gs_count>0)
471 {
472 return if (!$self->close_file_output());
473 }
474
[17202]475 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
[12330]476
477
478 $self->{'gs_filename'} = $doc_file;
[12459]479 $self->{'short_doc_file'} = $short_doc_file;
[12330]480 $self->{'gs_OID'} = $OID;
481 $self->{'gs_doc_dir'} = $doc_dir;
482
[17202]483 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
[12330]484 }
485 # Otherwise load the same archive document directory used last time
486 else
487 {
488 $doc_dir = $self->{'gs_doc_dir'};
489 }
490
491 # copy all the associated files, add this information as metadata
492 # to the document
[13172]493 print $outhandle "Writing associated files to $doc_dir\n";
[12330]494 $self->process_assoc_files ($doc_obj, $doc_dir);
[19494]495
496 # look up 'gsdlmetafile' metadata and store that information
497 # explicitly in $doc_obj
498 $self->process_metafiles_metadata ($doc_obj);
[12330]499 }
500
501 # save this document
[13172]502 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
503 print GROUPPROCESS $section_text;
[12330]504
505 $self->{'gs_count'}++;
506}
507
508
509sub saveas {
510 my $self = shift (@_);
511
512 die "Basplug::saveas function must be implemented in sub classes\n";
513}
514
515sub get_doc_dir {
516 my $self = shift (@_);
517 my ($OID, $source_filename) = @_;
518
519 my $working_dir = $self->get_output_dir();
[19775]520 my $working_info = $self->{'output_info'};
[12330]521 return if (!defined $working_info);
522
523 my $doc_info = $working_info->get_info($OID);
524 my $doc_dir = '';
525
[16252]526 if (defined $doc_info && scalar(@$doc_info) >= 1)
527 {
528 # This OID already has an archives directory, so use it again
[12330]529 $doc_dir = $doc_info->[0];
530 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
[16252]531 }
532 elsif ($self->{'keep_import_structure'})
533 {
[12330]534 $source_filename = &File::Basename::dirname($source_filename);
535 $source_filename =~ s/[\\\/]+/\//g;
536 $source_filename =~ s/\/$//;
537
538 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
[16252]539 }
[12330]540
[16252]541 # We have to use a new archives directory for this document
542 if ($doc_dir eq "")
543 {
544 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
[12330]545 }
546
[12603]547 if (!defined $self->{'group'} || !$self->{'group'}){
548 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
549 }
[16252]550
[12330]551 return $doc_dir;
552}
553
554sub get_new_doc_dir{
555 my $self = shift (@_);
556 my($working_info,$working_dir,$OID) = @_;
557
[19180]558
[12330]559 my $doc_dir = "";
560 my $doc_dir_rest = $OID;
[19180]561 # remove any \ and / from the OID
562 $doc_dir_rest =~ s/[\\\/]//g;
[12330]563 my $doc_dir_num = 0;
564
565 do {
566 $doc_dir .= "/" if $doc_dir_num > 0;
567 if ($doc_dir_rest =~ s/^(.{1,8})//) {
568 $doc_dir .= $1;
569 $doc_dir_num++;
570 }
571 } while ($doc_dir_rest ne "" &&
572 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
573 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
[17294]574 my $i = 1;
575 my $doc_dir_base = $doc_dir;
576 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
577 $doc_dir = "$doc_dir_base-$i";
578 $i++;
579 }
580
[16252]581 return "$doc_dir.dir";
[12330]582}
583
584sub process_assoc_files {
585 my $self = shift (@_);
586 my ($doc_obj, $doc_dir, $handle) = @_;
587
588 my $outhandle = $self->{'output_handle'};
589
590 my $output_dir = $self->get_output_dir();
591 return if (!defined $output_dir);
592
593 &util::mk_all_dir ($output_dir) unless -e $output_dir;
594
595 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
596 &util::mk_all_dir ($working_dir) unless -e $working_dir;
597
598 my @assoc_files = ();
599 my $filename;;
600
601 my $source_filename = $doc_obj->get_source_filename();
602
603 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
604
605 if (defined $collect_dir) {
606 my $dirsep_regexp = &util::get_os_dirsep();
607
608 if ($collect_dir !~ /$dirsep_regexp$/) {
609 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
610 }
611
612 # This test is never going to fail on Windows -- is this a problem?
613
614 if ($source_filename !~ /^$dirsep_regexp/) {
615 $source_filename = &util::filename_cat($collect_dir, $source_filename);
616 }
617 }
618
619
620 # set the assocfile path (even if we have no assoc files - need this for lucene)
621 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
622 "assocfilepath",
623 "$doc_dir");
624 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
625 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
626 $dir = "" unless defined $dir;
627
628
629 my $real_filename = $assoc_file_rec->[0];
630 # for some reasons the image associate file has / before the full path
631 $real_filename =~ s/^\\(.*)/$1/i;
632 if (-e $real_filename) {
633
634 $filename = &util::filename_cat($working_dir, $afile);
635
[18463]636 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
[12330]637
638 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
639 "gsdlassocfile",
640 "$afile:$assoc_file_rec->[2]:$dir");
641 } elsif ($self->{'verbosity'} > 2) {
[17202]642 print $outhandle "BasePlugout::process couldn't copy the associated file " .
[12330]643 "$real_filename to $afile\n";
644 }
645 }
646}
647
[17087]648
[19494]649sub process_metafiles_metadata
650{
651 my $self = shift (@_);
652 my ($doc_obj) = @_;
653
654 my $top_section = $doc_obj->get_top_section();
655 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
656
657 foreach my $metafile_pair (@$metafiles) {
[19516]658 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
[19494]659
660 $doc_obj->metadata_file($full_metafile,$metafile);
661 }
662
663 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
664}
665
666sub archiveinf_files_to_field
667{
668 my $self = shift(@_);
669 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
670
671 foreach my $file_rec (@$files) {
672 my $real_filename = $file_rec->[0];
673 my $full_file = $file_rec->[1];
674
675 # for some reasons the image associate file has / before the full path
676 $real_filename =~ s/^\\(.*)/$1/i;
677 if (-e $real_filename) {
678
679 if (defined $collect_dir) {
680 my $collect_dir_re_safe = $collect_dir;
681 $collect_dir_re_safe =~ s/\\/\\\\/g;
682 $collect_dir_re_safe =~ s/\./\\./g;
683
684 $real_filename =~ s/^$collect_dir_re_safe//;
685 }
686
687 $reverse_lookups->{$real_filename} = 1;
688
689 push(@{$oid_files->{$field}},$full_file);
690 }
691 else {
[19516]692 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
[19494]693 }
694 }
695}
696
[17087]697sub archiveinf_gdbm
698{
699 my $self = shift (@_);
700 my ($doc_obj) = @_;
701
702 my $verbosity = $self->{'verbosity'};
703
704 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
705 if (defined $collect_dir) {
706 my $dirsep_regexp = &util::get_os_dirsep();
707
708 if ($collect_dir !~ /$dirsep_regexp$/) {
709 # ensure there is a slash at the end
710 $collect_dir .= &util::get_dirsep();
711 }
712 }
713
714 my $oid = $doc_obj->get_OID();
[19829]715 my $source_filename = $doc_obj->get_unmodified_source_filename();
[17087]716
[18441]717 my $working_info = $self->{'output_info'};
718 my $doc_info = $working_info->get_info($oid);
719 my ($doc_file,$index_status) = @$doc_info;
720
721 my $oid_files = { 'doc-file' => $doc_file,
722 'index-status' => $index_status,
723 'src-file' => $source_filename,
[19775]724 'assoc-file' => [],
725 'meta-file' => [] };
[17087]726
[19494]727 my $reverse_lookups = { $source_filename => "1" };
[17087]728
729
[19775]730 $self->archiveinf_files_to_field($doc_obj->get_assoc_files(),"assoc-file",
[19494]731 $collect_dir,$oid_files,$reverse_lookups);
[17087]732
[19829]733# *******
[19494]734# foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
735# my $real_filename = $assoc_file_rec->[0];
736# my $full_afile = $assoc_file_rec->[1];
737#
738# # for some reasons the image associate file has / before the full path
739# $real_filename =~ s/^\\(.*)/$1/i;
740# if (-e $real_filename) {
741#
742# if (defined $collect_dir) {
743# my $collect_dir_re_safe = $collect_dir;
744# $collect_dir_re_safe =~ s/\\/\\\\/g;
745# $collect_dir_re_safe =~ s/\./\\./g;
746#
747# $real_filename =~ s/^$collect_dir_re_safe//;
748# }
749#
750# $reverse_lookups->{$real_filename} = 1;
751#
[19775]752# push(@{$oid_files->{'assoc-file'}},$full_afile);
[19494]753# }
754# else {
755# print STDERR "Warning: archiveinf_gdbm()\n $real_filename does not appear to be on the file system\n";
756# }
757# }
[17120]758
[19775]759 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
[19494]760 $collect_dir,$oid_files,$reverse_lookups);
[17087]761
762
763 # better not to commit to a particular db implementation, but
764 # for simplicity, will use GDBM for now.
765
766 my $output_dir = $self->{'output_dir'};
767
[18659]768 my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc.gdb");
769 my $src_db = &util::filename_cat($output_dir,"archiveinf-src.gdb");
[17087]770
[19775]771# my $doc_db_text = "";
772# $doc_db_text .= "<doc-file>$oid_files->{'doc-file'}\n";
773# $doc_db_text .= "<index-status>$oid_files->{'index-status'}\n";
774# $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n";
[19494]775
[19775]776# foreach my $af (@{$oid_files->{'assoc-file'}}) {
777# $doc_db_text .= "<assoc-file>$af\n";
778# }
[19494]779
[19775]780# foreach my $mf (@{$oid_files->{'meta-file'}}) {
781# $doc_db_text .= "<meta-file>$mf\n";
782# }
[19494]783
[19775]784# chomp($doc_db_text); # remove trailing \n
[17087]785
[18528]786 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
[17087]787
[19775]788 ### &GDBMUtils::gdbmDatabaseSet($doc_db,$oid,$doc_db_text);
789
790 # switch to using GDBMUtils
791
792 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
793 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
794 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
795
796 my $infodb_file_handle
797 = &dbutil::open_infodb_write_handle_gdbm($doc_db,"append");
798 &dbutil::write_infodb_entry_gdbm($infodb_file_handle,$oid,$oid_files);
799 &dbutil::close_infodb_write_handle_gdbm($infodb_file_handle);
800
801
802
[19494]803 foreach my $rl (keys %$reverse_lookups) {
[19775]804 ## &GDBMUtils::gdbmDatabaseAppend($src_db,$rl,"<oid>$oid\n");
805 $working_info->add_reverseinfo($rl,$oid);
806 }
[17087]807}
808
809
[12330]810sub set_sortmeta {
811 my $self = shift (@_);
812 my ($sortmeta, $removeprefix, $removesuffix) = @_;
813
814 $self->{'sortmeta'} = $sortmeta;
815 if (defined ($removeprefix) && $removeprefix ) {
816 $removeprefix =~ s/^\^//; # don't need a leading ^
817 $self->{'removeprefix'} = $removeprefix;
818 }
819 if (defined ($removesuffix) && $removesuffix) {
820 $removesuffix =~ s/\$$//; # don't need a trailing $
821 $self->{'removesuffix'} = $removesuffix;
822 }
823}
824
825sub open_xslt_pipe
826{
827 my $self = shift @_;
828 my ($output_file_name, $xslt_file)=@_;
829
830 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
831
832 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
[13013]833
[15140]834 my $mapping_file_path = "";
835
[13225]836 if ($ENV{'GSDLOS'} eq "windows"){
837 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
[15140]838 $xslt_file = "\"file:///".$xslt_file."\"";
839 $mapping_file_path = "\"file:///";
[13225]840 }
841 else{
842 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
843 }
844
845
[13013]846 $java_class_path = "\"".$java_class_path."\"";
[13024]847
[14969]848 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t $xslt_file ";
[12330]849
[12603]850 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
[13064]851 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
[14969]852 $cmd .= "-m $mapping_file_path";
[12603]853 }
[14969]854
[12330]855 open(*XMLWRITER, $cmd)
856 or die "can't open pipe to xslt: $!";
857
858
859 $self->{'xslt_writer'} = *XMLWRITER;
860
861 print XMLWRITER "<?DocStart?>\n";
862 print XMLWRITER "$output_file_name\n";
[14969]863
[12330]864
865 }
866
867
868sub close_xslt_pipe
869{
870 my $self = shift @_;
871
872
873 return unless defined $self->{'xslt_writer'} ;
874
875 my $xsltwriter = $self->{'xslt_writer'};
876
877 print $xsltwriter "<?DocEnd?>\n";
878 close($xsltwriter);
[13024]879
880 undef $self->{'xslt_writer'};
881
[12330]882}
883
884sub close_file_output
885{
886 my ($self) = @_;
887
888 # make sure that the handle has been opened - it won't be if we failed
889 # to import any documents...
890 if (defined(fileno(GROUPPROCESS))) {
891 $self->output_xml_footer('GROUPPROCESS','Archive');
892 close GROUPPROCESS;
893 }
894
895 my $OID = $self->{'gs_OID'};
896 my $short_doc_file = $self->{'short_doc_file'};
897
898 if ($self->{'gzip'}) {
899 my $doc_file = $self->{'gs_filename'};
900 `gzip $doc_file`;
901 $doc_file .= ".gz";
902 $short_doc_file .= ".gz";
903 if (!-e $doc_file) {
904 my $outhandle = $self->{'output_handle'};
905 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
906 return 0;
907 }
908 }
909
910 # store reference in output_info
911 my $output_info = $self->{'output_info'};
912 return 0 if (!defined $output_info);
913 $output_info->add_info($OID, $short_doc_file, undef, undef);
914 return 1;
915}
916
[13172]917
[12330]918#the subclass should implement this method if is_group method could return 1.
919sub close_group_output{
920 my $self = shift (@_);
921}
922
923sub is_group {
924 my $self = shift (@_);
925 return 0;
926}
927
[13172]928my $dc_set = { Title => 1,
929 Creator => 1,
930 Subject => 1,
931 Description => 1,
932 Publisher => 1,
933 Contributor => 1,
934 Date => 1,
935 Type => 1,
936 Format => 1,
937 Identifier => 1,
938 Source => 1,
939 Language => 1,
940 Relation => 1,
941 Coverage => 1,
942 Rights => 1};
943
944
945# returns an XML representation of the dublin core metadata
946# if dc meta is not found, try ex mete
947sub get_dc_metadata {
948 my $self = shift(@_);
949 my ($doc_obj, $section, $version) = @_;
950
951 # build up string of dublin core metadata
952 $section="" unless defined $section;
953
954 my $section_ptr = $doc_obj->_lookup_section($section);
955 return "" unless defined $section_ptr;
956
957
958 my $explicit_dc = {};
959 my $explicit_ex = {};
960
961 my $all_text="";
962 foreach my $data (@{$section_ptr->{'metadata'}}){
963 my $escaped_value = &docprint::escape_text($data->[1]);
964 if ($data->[0]=~ m/^dc\./) {
965 $data->[0] =~ tr/[A-Z]/[a-z]/;
966
967 $data->[0] =~ m/^dc\.(.*)/;
968 my $dc_element = $1;
969
970 if (!defined $explicit_dc->{$dc_element}) {
971 $explicit_dc->{$dc_element} = [];
972 }
973 push(@{$explicit_dc->{$dc_element}},$escaped_value);
974
975 if (defined $version && ($version eq "oai_dc")) {
976 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
977 }
978 else {
979 # qualifier???
980 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
981 }
982
983 }
984 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
985 $data->[0] =~ m/^(ex\.)?(.*)/;
986 my $ex_element = $2;
987 my $lc_ex_element = lc($ex_element);
988
989 if (defined $dc_set->{$ex_element}) {
990 if (!defined $explicit_ex->{$lc_ex_element}) {
991 $explicit_ex->{$lc_ex_element} = [];
992 }
993 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
994 }
995 }
996 }
997
998 # go through dc_set and for any element *not* defined in explicit_dc
[14932]999 # that does exist in explicit_ex, add it in as metadata
[13172]1000 foreach my $k ( keys %$dc_set ) {
1001 my $lc_k = lc($k);
1002
1003 if (!defined $explicit_dc->{$lc_k}) {
1004 if (defined $explicit_ex->{$lc_k}) {
1005
1006 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1007 my $dc_element = $lc_k;
1008 my $escaped_value = $v;
1009
1010 if (defined $version && ($version eq "oai_dc")) {
1011 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1012 }
1013 else {
1014 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1015 }
1016
1017 }
1018 }
1019 }
1020 }
1021
1022 if ($all_text eq "") {
1023 $all_text .= " There is no Dublin Core metatdata in this document\n";
1024 }
1025 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1026
1027 return $all_text;
1028}
1029
1030# Build up dublin_core metadata. Priority given to dc.* over ex.*
1031# This method was apparently added by Jeffrey and committed by Shaoqun.
1032# But we don't know why it was added, so not using it anymore.
1033sub new_get_dc_metadata {
1034
1035 my $self = shift(@_);
1036 my ($doc_obj, $section, $version) = @_;
1037
1038 # build up string of dublin core metadata
1039 $section="" unless defined $section;
1040
1041 my $section_ptr=$doc_obj->_lookup_section($section);
1042 return "" unless defined $section_ptr;
1043
1044 my $all_text = "";
1045 foreach my $data (@{$section_ptr->{'metadata'}}){
1046 my $escaped_value = &docprint::escape_text($data->[1]);
1047 my $dc_element = $data->[0];
1048
1049 my @array = split('\.',$dc_element);
1050 my ($type,$name);
1051
1052 if(defined $array[1])
1053 {
1054 $type = $array[0];
1055 $name = $array[1];
1056 }
1057 else
1058 {
1059 $type = "ex";
1060 $name = $array[0];
1061 }
1062
1063 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1064 }
1065 return $all_text;
1066}
1067
1068
[12330]10691;
Note: See TracBrowser for help on using the repository browser.