root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 26536

Revision 26536, 32.0 KB (checked in by davidb, 7 years ago)

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including rebuilt after the Greenstone install has been upgraded

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60     'deft' => "",
61    'hiddengli' => "no"},
62       { 'name' => "output_handle",
63    'desc' => "{BasPlugout.output_handle}",
64    'type' => "string",
65        'deft' =>  'STDERR',
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68       { 'name' => "verbosity",
69    'desc' => "{BasPlugout.verbosity}",
70    'type' => "int",
71        'deft' =>  "0",
72    'reqd' => "no", 
73        'hiddengli' => "no"},
74       { 'name' => "gzip_output",
75    'desc' => "{BasPlugout.gzip_output}",
76    'type' => "flag",
77    'reqd' => "no", 
78        'hiddengli' => "no"},
79       { 'name' => "debug",
80     'desc' => "{BasPlugout.debug}",
81     'type' => "flag",
82     'reqd' => "no",
83     'hiddengli' => "yes"}
84];
85
86my $options = { 'name'     => "BasePlugout",
87        'desc'     => "{BasPlugout.desc}",
88        'abstract' => "yes",
89        'inherits' => "no",
90        'args'     => $arguments};
91
92sub new
93{
94    my $class = shift (@_);
95
96    my ($plugoutlist,$args,$hashArgOptLists) = @_;
97    push(@$plugoutlist, $class);
98
99    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102    push(@{$hashArgOptLists->{"OptList"}},$options);
103
104    my $self = {};
105    $self->{'plugout_type'} = $class;
106    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107    $self->{"info_only"} = 0;
108
109    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110    # the args, just return the object. 
111    foreach my $strArg (@{$args})
112    {
113    if(defined $strArg && $strArg eq "-gsdlinfo")
114    {
115        $self->{"info_only"} = 1;
116        return bless $self, $class;
117    }
118    }
119   
120    delete $self->{"info_only"};
121   
122    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123    {
124    my $classTempClass = bless $self, $class;
125    print STDERR "<BadPlugout d=$plugout_name>\n";
126    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127    $classTempClass->print_txt_usage("");  # Use default resource bundle
128    die "\n";
129    }
130
131 
132    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
133    {
134    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135    if (!defined $full_file_path) {
136        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137        die "\n";
138    }
139    $self->{'xslt_file'} = $full_file_path;
140    }
141
142    $self->{'gs_count'} = 0;   
143
144    $self->{'keep_import_structure'} = 0;
145
146    return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152    my $self = shift(@_);
153    my $header = shift(@_);
154    my $high_level_information_only = shift(@_);
155   
156    # XML output is always in UTF-8
157    gsprintf::output_strings_in_UTF8;
158
159    if ($header) {
160    &PrintUsage::print_xml_header("plugout");
161    }
162    $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168    my $self = shift(@_);
169    my $high_level_information_only = shift(@_);
170
171    my $optionlistref = $self->{'option_list'};
172    my @optionlist = @$optionlistref;
173    my $plugoutoptions = shift(@$optionlistref);
174    return if (!defined($plugoutoptions));
175
176    gsprintf(STDERR, "<PlugoutInfo>\n");
177    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
178    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179    $desc =~ s/</&amp;lt;/g; # doubly escaped
180    $desc =~ s/>/&amp;gt;/g;
181    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
182    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184    unless (defined($high_level_information_only)) {
185    gsprintf(STDERR, "  <Arguments>\n");
186    if (defined($plugoutoptions->{'args'})) {
187        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188    }
189    gsprintf(STDERR, "  </Arguments>\n");
190
191    # Recurse up the plugout hierarchy
192    $self->print_xml();
193    }
194    gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200    my $self = shift(@_);
201
202    # Print the usage message for a plugout (recursively)
203    my $descoffset = $self->determine_description_offset(0);
204    $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209    my $self = shift(@_);
210    my $maxoffset = shift(@_);
211
212    my $optionlistref = $self->{'option_list'};
213    my @optionlist = @$optionlistref;
214    my $plugoutoptions = pop(@$optionlistref);
215    return $maxoffset if (!defined($plugoutoptions));
216
217    # Find the length of the longest option string of this download
218    my $plugoutargs = $plugoutoptions->{'args'};
219    if (defined($plugoutargs)) {
220    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221    if ($longest > $maxoffset) {
222        $maxoffset = $longest;
223    }
224    }
225
226    # Recurse up the download hierarchy
227    $maxoffset = $self->determine_description_offset($maxoffset);
228    $self->{'option_list'} = \@optionlist;
229    return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235    my $self = shift(@_);
236    my $descoffset = shift(@_);
237    my $isleafclass = shift(@_);
238
239    my $optionlistref = $self->{'option_list'};
240    my @optionlist = @$optionlistref;
241    my $plugoutoptions = shift(@$optionlistref);
242    return if (!defined($plugoutoptions));
243
244    my $plugoutname = $plugoutoptions->{'name'};
245    my $plugoutargs = $plugoutoptions->{'args'};
246    my $plugoutdesc = $plugoutoptions->{'desc'};
247
248    # Produce the usage information using the data structure above
249    if ($isleafclass) {
250    if (defined($plugoutdesc)) {
251        gsprintf(STDERR, "$plugoutdesc\n\n");
252    }
253    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254    }
255
256    # Display the download options, if there are some
257    if (defined($plugoutargs)) {
258    # Calculate the column offset of the option descriptions
259    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
260
261    if ($isleafclass) {
262        gsprintf(STDERR, " {common.specific_options}:\n");
263    }
264    else {
265        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266    }
267
268    # Display the download options
269    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270    }
271
272    # Recurse up the download hierarchy
273    $self->print_plugout_usage($descoffset, 0);
274    $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280      my ($strFunctionName,$strError) = @_;
281    {
282    print "Error occoured in BasePlugout.pm\n".
283        "In Function: ".$strFunctionName."\n".
284        "Error Message: ".$strError."\n";
285    exit(-1);
286    } 
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291    my $self = shift (@_);
292    my ($type, $metadata) = @_;
293
294    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
295    $self->{'OIDtype'} = $type;
296    } else {
297    $self->{'OIDtype'} = "hash";
298    }
299    if ($type =~ /^assigned$/) {
300    if (defined $metadata) {
301        $self->{'OIDmetadata'} = $metadata;
302    } else {
303        $self->{'OIDmetadata'} = "dc.Identifier";
304    }
305    }
306}
307
308sub set_output_dir
309{
310    my $self = shift @_;
311    my ($output_dir) = @_;
312
313   $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318    my $self = shift @_;
319    my ($output_dir) = @_;
320
321    $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326    my $self = shift (@_);
327
328   return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333    my $self = shift (@_);
334
335    return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340    my $self = shift (@_);
341
342    return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348    my $self = shift (@_);
349
350    my ($output_file_name) = @_;
351     
352    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
353
354    return *OUTPUT;           
355}
356
357sub release_output_handler
358{
359    my $self = shift (@_);
360    my ($outhandler) = @_;
361
362    close($outhandler);
363
364}
365
366sub output_xml_header {
367    my $self = shift (@_);
368    my ($handle,$docroot,$nondoctype) = @_;
369   
370   
371    #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372   
373    #For Dspace must be UTF in lower case
374    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
375   
376    if (!defined $nondoctype){
377    my $doctype = (defined $docroot) ? $docroot : "Section";
378
379    # Used to be '<!DOCTYPE Archive SYSTEM ...'
380   
381    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
382    }
383
384    print $handle "<$docroot>\n" if defined $docroot;
385}
386
387sub output_xml_footer {
388    my $self = shift (@_);
389    my ($handle,$docroot) = @_;
390    print $handle "</$docroot>\n" if defined $docroot;
391}
392
393
394sub output_general_xml_header
395{
396    my $self = shift (@_);
397    my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
398   
399    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
400   
401    if (defined $opt_dtd) {
402        my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
403    print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
404    }
405
406    if (defined $docroot) {
407        my $full_docroot = $docroot;
408        if (defined $opt_attributes) {
409          $full_docroot .= " $opt_attributes";
410        }
411
412        print $handle "<$full_docroot>\n"
413      }
414}
415
416sub output_general_xml_footer
417{
418    output_xml_footer(@_);
419}
420
421
422sub process {
423    my $self = shift (@_);
424    my ($doc_obj) = @_;
425   
426    # for OAI purposes
427    $doc_obj->set_lastmodified();
428    $doc_obj->set_oailastmodified();
429
430     if ($self->{'group_size'} > 1) {
431     $self->group_process ($doc_obj);
432    return;
433    }
434
435    my $OID = $doc_obj->get_OID();
436    $OID = "NULL" unless defined $OID;     
437
438    my $top_section = $doc_obj->get_top_section();
439
440    #get document's directory
441    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
442   
443    my $output_info = $self->{'output_info'};
444    return if (!defined $output_info);
445     
446    ##############################
447    # call subclass' saveas method
448    ##############################
449    $self->saveas($doc_obj,$doc_dir);
450    $self->archiveinf_db($doc_obj,$doc_dir);
451
452}
453
454sub store_output_info_reference {
455    my $self = shift (@_);
456    my ($doc_obj) = @_;
457
458    my $output_info = $self->{'output_info'};
459    my $metaname = $self->{'sortmeta'};
460    if (!defined $metaname || $metaname !~ /\S/) {
461    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
462    return;
463    }
464   
465    my $metadata = "";
466    my $top_section = $doc_obj->get_top_section();
467   
468    my @commameta_list = split(/,/, $metaname);
469    foreach my $cmn (@commameta_list) {
470    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
471    if ($meta) {
472        # do remove prefix/suffix - this will apply to all values
473        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
474        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
475        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
476        $metadata .= $meta if ($meta);
477    }
478    }
479
480    # store reference in the output_info     
481    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
482   
483}
484
485sub group_process {
486
487    my $self = shift (@_);
488    my ($doc_obj) = @_;
489   
490    my $OID = $doc_obj->get_OID();
491    $OID = "NULL" unless defined $OID;
492
493    my $groupsize = $self->{'group_size'};
494    my $gs_count = $self->{'gs_count'};
495    my $open_new_file = (($gs_count % $groupsize)==0);
496    my $outhandle = $self->{'output_handle'};
497
498    # opening a new file, or document has assoicated files => directory needed
499    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
500         
501        # The directory the archive file (doc.xml) and all associated files
502        # should end up in
503        my $doc_dir;
504        # If we've determined its time for a new file, open it now
505        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
506          {
507            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
508            # only if opening new file
509        my $output_dir = $self->get_output_dir();
510        &util::mk_all_dir ($output_dir) unless -e $output_dir;
511        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
512        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
513       
514        if ($gs_count>0)
515        {
516        return if (!$self->close_file_output());
517        }
518
519        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
520           
521
522        $self->{'gs_filename'} = $doc_file;
523        $self->{'short_doc_file'} = $short_doc_file;
524        $self->{'gs_OID'} = $OID;
525            $self->{'gs_doc_dir'} = $doc_dir;
526
527        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
528    }
529        # Otherwise load the same archive document directory used last time
530        else
531          {
532            $doc_dir = $self->{'gs_doc_dir'};
533          }
534
535    # copy all the associated files, add this information as metadata
536    # to the document
537        print $outhandle "Writing associated files to $doc_dir\n";
538    $self->process_assoc_files ($doc_obj, $doc_dir);
539
540    # look up 'gsdlmetafile' metadata and store that information
541    # explicitly in $doc_obj
542    $self->process_metafiles_metadata ($doc_obj);
543    }
544
545    # save this document
546    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
547    print GROUPPROCESS $section_text;
548
549    $self->{'gs_count'}++;
550}
551
552
553sub saveas {
554    my $self = shift (@_);
555   
556    die "Basplug::saveas function must be implemented in sub classes\n";
557}
558
559sub get_doc_dir {
560    my $self = shift (@_);
561    my ($OID, $source_filename) = @_;
562
563    my $working_dir  = $self->get_output_dir();
564    my $working_info = $self->{'output_info'};
565    return if (!defined $working_info);
566
567    my $doc_info = $working_info->get_info($OID);
568    my $doc_dir = '';
569
570    if (defined $doc_info && scalar(@$doc_info) >= 1)
571    {
572    # This OID already has an archives directory, so use it again
573    $doc_dir = $doc_info->[0];
574    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
575    }
576    elsif ($self->{'keep_import_structure'})
577    {
578    $source_filename = &File::Basename::dirname($source_filename);
579    $source_filename =~ s/[\\\/]+/\//g;
580    $source_filename =~ s/\/$//;
581
582        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
583    }
584
585    # We have to use a new archives directory for this document
586    if ($doc_dir eq "")
587    {
588    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
589    }
590
591    if (!defined $self->{'group'} || !$self->{'group'}){
592    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
593    }
594
595    return $doc_dir;
596}
597
598sub get_new_doc_dir{
599   my $self = shift (@_); 
600   my($working_info,$working_dir,$OID) = @_;     
601   
602   
603   my $doc_dir = "";
604   my $doc_dir_rest = $OID;
605
606   # remove any \ and / from the OID
607   $doc_dir_rest =~ s/[\\\/]//g;
608
609   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
610   $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
611
612   my $doc_dir_num = 0;
613
614   do {
615       $doc_dir .= "/" if $doc_dir_num > 0;
616       if ($doc_dir_rest =~ s/^(.{1,8})//) {
617       $doc_dir .= $1;
618       $doc_dir_num++;
619       }
620   } while ($doc_dir_rest ne "" &&
621        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
622         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
623   my $i = 1;
624   my $doc_dir_base = $doc_dir;
625   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
626       $doc_dir = "$doc_dir_base-$i";
627       $i++;
628   }
629         
630   return "$doc_dir.dir";
631}
632
633sub process_assoc_files {
634    my $self = shift (@_);
635    my ($doc_obj, $doc_dir, $handle) = @_;
636
637    my $outhandle = $self->{'output_handle'};
638   
639    my $output_dir = $self->get_output_dir();
640    return if (!defined $output_dir);
641
642    &util::mk_all_dir ($output_dir) unless -e $output_dir;
643     
644    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
645    &util::mk_all_dir ($working_dir) unless -e $working_dir;
646
647    my @assoc_files = ();
648    my $filename;;
649
650    my $source_filename = $doc_obj->get_source_filename();
651
652    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
653
654    if (defined $collect_dir) {
655    my $dirsep_regexp = &util::get_os_dirsep();
656
657    if ($collect_dir !~ /$dirsep_regexp$/) {
658        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
659    }
660
661    # This test is never going to fail on Windows -- is this a problem?
662     
663    if ($source_filename !~ /^$dirsep_regexp/) {
664        $source_filename = &util::filename_cat($collect_dir, $source_filename);
665    }
666    }
667
668
669    # set the assocfile path (even if we have no assoc files - need this for lucene)
670    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
671                     "assocfilepath",
672                     "$doc_dir");
673    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
674    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
675    $dir = "" unless defined $dir;
676       
677    my $utf8_real_filename = $assoc_file_rec->[0];
678
679    # for some reasons the image associate file has / before the full path
680    $utf8_real_filename =~ s/^\\(.*)/$1/i;
681
682##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
683    my $real_filename = $utf8_real_filename;
684    $real_filename = &util::downgrade_if_dos_filename($real_filename);
685
686    if (-e $real_filename) {
687
688        $filename = &util::filename_cat($working_dir, $afile);
689
690        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
691       
692        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
693                     "gsdlassocfile",
694                     "$afile:$assoc_file_rec->[2]:$dir");
695    } elsif ($self->{'verbosity'} > 1) {
696        print $outhandle "BasePlugout::process couldn't copy the associated file " .
697        "$real_filename to $afile\n";
698    }
699    }
700}
701
702
703sub process_metafiles_metadata
704{
705    my $self = shift (@_);
706    my ($doc_obj) = @_;
707
708    my $top_section = $doc_obj->get_top_section();
709    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
710
711    foreach my $metafile_pair (@$metafiles) {
712    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
713
714    $doc_obj->metadata_file($full_metafile,$metafile);
715    }
716
717    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
718}
719
720sub archiveinf_files_to_field
721{
722    my $self = shift(@_);
723    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
724
725    foreach my $file_rec (@$files) {
726    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
727    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
728    # for some reasons the image associate file has / before the full path
729    $real_filename =~ s/^\\(.*)/$1/i;
730
731    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
732
733    if (-e $raw_filename) {
734
735#       if (defined $collect_dir) {
736#       my $collect_dir_re_safe = $collect_dir;
737#       $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
738#       $collect_dir_re_safe =~ s/\./\\./g;##
739
740#       $real_filename =~ s/^$collect_dir_re_safe//;
741#       }
742       
743        if (defined $reverse_lookups) {
744        $reverse_lookups->{$real_filename} = 1;
745        }
746###     push(@{$oid_files->{$field}},$full_file);
747        push(@{$oid_files->{$field}},$raw_filename);
748    }
749    else {
750        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
751    }
752    }
753}
754
755sub archiveinf_db
756{
757    my $self = shift (@_);
758    my ($doc_obj) = @_;
759
760    my $verbosity = $self->{'verbosity'};
761
762    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
763    if (defined $collect_dir) {
764    my $dirsep_regexp = &util::get_os_dirsep();
765
766    if ($collect_dir !~ /$dirsep_regexp$/) {
767        # ensure there is a slash at the end
768        $collect_dir .= &util::get_dirsep();
769    }
770    }
771
772    my $oid = $doc_obj->get_OID();
773    my $source_filename = $doc_obj->get_unmodified_source_filename();
774    my $working_info = $self->{'output_info'};
775    my $doc_info = $working_info->get_info($oid);
776
777    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
778    # doc_file is the path to the archive doc.xml. Make sure it has unix
779    # slashes, then if the collection is copied to linux, it can be built without reimport
780    $doc_file =~ s/\\/\//g;
781    my $oid_files = { 'doc-file' => $doc_file,
782              'index-status' => $index_status,
783              'src-file' => $source_filename,
784              'sort-meta' => $sortmeta,
785              'assoc-file' => [],
786              'meta-file'  => [] };
787   
788    my $reverse_lookups = { $source_filename => "1" };
789
790
791    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
792                     $collect_dir,$oid_files,$reverse_lookups);
793
794
795    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
796                     $collect_dir,$oid_files);
797
798    # Get the infodbtype value for this collection from the arcinfo object
799    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
800    my $output_dir = $self->{'output_dir'};
801
802    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
803    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
804
805    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
806
807    if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
808    my $top_section = $doc_obj->get_top_section();
809
810    my $dc_titles = $doc_obj->get_metadata($top_section,"dls.Title");
811    my $dc_title = join("; ", @$dc_titles);
812
813    if ($oid_files->{'index-status'} eq "R") {
814        $dc_title .= " (Updated)";
815    }
816
817    my $rss_filename = &util::filename_cat($output_dir,"rss-items.rdf");
818    if (open(RSSOUT,">>$rss_filename")) {
819        print RSSOUT "<item>\n";
820        print RSSOUT "   <title>$dc_title</title>\n";
821        print RSSOUT "   <link>_httpdomain__httpcollection_/document/$oid</link>\n";
822        print RSSOUT "</item>\n";
823        close(RSSOUT);
824    }
825    else {
826        print STDERR "**** Failed to open $rss_filename\n!$\n";
827    }
828
829
830    }
831
832    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
833    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
834    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
835    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
836
837    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
838    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
839    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
840
841    foreach my $rl (keys %$reverse_lookups) {
842    $working_info->add_reverseinfo($rl,$oid);
843    } 
844
845    # meta files not set in reverese entry, but need to set the metadata flag
846    if (defined $doc_obj->get_meta_files()) {
847    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
848        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
849        $working_info->set_meta_file_flag($full_file);
850    }
851    }
852}
853
854
855sub set_sortmeta {
856    my $self = shift (@_);
857    my ($sortmeta, $removeprefix, $removesuffix) = @_;
858   
859    $self->{'sortmeta'} = $sortmeta;
860    if (defined ($removeprefix) && $removeprefix ) {
861    $removeprefix =~ s/^\^//; # don't need a leading ^
862    $self->{'removeprefix'} = $removeprefix;
863    }
864    if (defined ($removesuffix) && $removesuffix) {
865    $removesuffix =~ s/\$$//; # don't need a trailing $
866    $self->{'removesuffix'} = $removesuffix;
867    }
868}
869
870sub open_xslt_pipe
871{
872    my $self = shift @_;
873    my ($output_file_name, $xslt_file)=@_;
874
875    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
876   
877    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
878
879    my $mapping_file_path = "";
880
881    if ($ENV{'GSDLOS'} eq "windows"){
882    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
883    # this file:/// bit didn't work for me on windows XP
884    #$xslt_file = "\"file:///".$xslt_file."\"";
885    #$mapping_file_path = "\"file:///";
886    }
887    else{
888    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
889    }
890
891
892    $java_class_path = "\"".$java_class_path."\"";
893
894    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
895
896    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
897    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
898    $cmd .= "-m $mapping_file_path";
899    }
900   
901    open(*XMLWRITER, $cmd)
902    or die "can't open pipe to xslt: $!";
903
904   
905    $self->{'xslt_writer'} = *XMLWRITER;
906
907    print XMLWRITER "<?DocStart?>\n";       
908    print XMLWRITER "$output_file_name\n";
909
910 
911  }
912 
913
914sub close_xslt_pipe
915{
916  my $self = shift @_;
917
918 
919  return unless defined $self->{'xslt_writer'} ;
920   
921  my $xsltwriter = $self->{'xslt_writer'};
922 
923  print $xsltwriter "<?DocEnd?>\n";
924  close($xsltwriter);
925
926  undef $self->{'xslt_writer'};
927
928}
929
930sub close_file_output
931{
932    my ($self) = @_;
933 
934    # make sure that the handle has been opened - it won't be if we failed
935    # to import any documents...
936    if (defined(fileno(GROUPPROCESS))) {
937    $self->output_xml_footer('GROUPPROCESS','Archive');   
938    close GROUPPROCESS;
939    }
940
941    my $OID = $self->{'gs_OID'};
942    my $short_doc_file = $self->{'short_doc_file'};
943   
944    if ($self->{'gzip'}) {
945    my $doc_file = $self->{'gs_filename'};
946    `gzip $doc_file`;
947    $doc_file .= ".gz";
948    $short_doc_file .= ".gz";
949    if (!-e $doc_file) {
950         my $outhandle = $self->{'output_handle'};
951        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
952        return 0;
953    }
954    }
955
956    # store reference in output_info
957    my $output_info = $self->{'output_info'};
958    return 0 if (!defined $output_info);
959    $output_info->add_info($OID, $short_doc_file, undef, undef);
960    return 1;
961}
962
963
964#the subclass should implement this method if is_group method could return 1.
965sub close_group_output{
966   my $self = shift (@_);       
967}
968
969sub is_group {
970    my $self = shift (@_);
971    return 0;       
972}
973
974my $dc_set = { Title => 1,       
975           Creator => 1,
976           Subject => 1,
977           Description => 1,
978           Publisher => 1,
979           Contributor => 1,
980           Date => 1,
981           Type => 1,
982           Format => 1,
983           Identifier => 1,
984           Source => 1,
985           Language => 1,
986           Relation => 1,
987           Coverage => 1,
988           Rights => 1};
989
990
991# returns an XML representation of the dublin core metadata
992# if dc meta is not found, try ex meta
993# This method is not used by the DSpacePlugout, which has its
994# own method to save its dc metadata
995sub get_dc_metadata {
996    my $self = shift(@_);
997    my ($doc_obj, $section, $version) = @_;
998   
999    # build up string of dublin core metadata
1000    $section="" unless defined $section;
1001   
1002    my $section_ptr = $doc_obj->_lookup_section($section);
1003    return "" unless defined $section_ptr;
1004
1005
1006    my $explicit_dc = {};
1007    my $explicit_ex_dc = {};
1008    my $explicit_ex = {};
1009
1010    my $all_text="";
1011   
1012    # We want high quality dc metadata to go in first, so we store all the
1013    # assigned dc.* values first. Then, for all those dc metadata names in
1014    # the official dc set that are as yet unassigned, we look to see whether
1015    # embedded ex.dc.* metadata has defined some values for them. If not,
1016    # then for the same missing dc metadata names, we look in ex metadata.
1017
1018    foreach my $data (@{$section_ptr->{'metadata'}}){
1019    my $escaped_value = &docprint::escape_text($data->[1]);
1020    if ($data->[0]=~ m/^dc\./) {
1021        $data->[0] =~ tr/[A-Z]/[a-z]/;
1022
1023        $data->[0] =~ m/^dc\.(.*)/;
1024        my $dc_element =  $1;
1025
1026        if (!defined $explicit_dc->{$dc_element}) {
1027        $explicit_dc->{$dc_element} = [];
1028        }
1029        push(@{$explicit_dc->{$dc_element}},$escaped_value);
1030
1031        if (defined $version && ($version eq "oai_dc")) {
1032        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1033        }
1034        else {
1035        # qualifier???
1036        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1037        }
1038
1039    } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1040        $data->[0] =~ m/^ex\.dc\.(.*)/;
1041        my $ex_dc_element = $1;
1042        my $lc_ex_dc_element = lc($ex_dc_element);
1043
1044        # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1045        if (defined $dc_set->{$ex_dc_element}) {
1046        if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1047            $explicit_ex_dc->{$lc_ex_dc_element} = [];
1048        }
1049        push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1050        }
1051    }
1052    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1053        $data->[0] =~ m/^(ex\.)?(.*)/;
1054        my $ex_element = $2;
1055        my $lc_ex_element = lc($ex_element);
1056
1057        if (defined $dc_set->{$ex_element}) {
1058        if (!defined $explicit_ex->{$lc_ex_element}) {
1059            $explicit_ex->{$lc_ex_element} = [];
1060        }
1061        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1062        }
1063    }
1064    }
1065
1066    # go through dc_set and for any element *not* defined in explicit_dc
1067    # that does exist in explicit_ex, add it in as metadata
1068    foreach my $k ( keys %$dc_set ) {
1069    my $lc_k = lc($k);
1070
1071    if (!defined $explicit_dc->{$lc_k}) {
1072        # try to find if ex.dc.* defines this dc.* meta,
1073        # if not, then look for whether there's an ex.* equivalent
1074
1075        if (defined $explicit_ex_dc->{$lc_k}) {
1076        foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1077            my $dc_element    = $lc_k;
1078            my $escaped_value = $v;
1079           
1080            if (defined $version && ($version eq "oai_dc")) {
1081            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1082            }
1083            else {
1084            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1085            }           
1086        }
1087        } elsif (defined $explicit_ex->{$lc_k}) {
1088        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1089            my $dc_element    = $lc_k;
1090            my $escaped_value = $v;
1091
1092            if (defined $version && ($version eq "oai_dc")) {
1093            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1094            }
1095            else {
1096            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1097            }
1098        }
1099        }
1100    }
1101    }
1102
1103    if ($all_text eq "") {
1104    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1105    }   
1106    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1107
1108    return $all_text;
1109}
1110
1111# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1112# This method was apparently added by Jeffrey and committed by Shaoqun.
1113# But we don't know why it was added, so not using it anymore.
1114sub new_get_dc_metadata {
1115   
1116    my $self = shift(@_);
1117    my ($doc_obj, $section, $version) = @_;
1118
1119    # build up string of dublin core metadata
1120    $section="" unless defined $section;
1121   
1122    my $section_ptr=$doc_obj->_lookup_section($section);
1123    return "" unless defined $section_ptr;
1124
1125    my $all_text = "";
1126    foreach my $data (@{$section_ptr->{'metadata'}}){
1127    my $escaped_value = &docprint::escape_text($data->[1]);
1128    my $dc_element =  $data->[0];
1129   
1130    my @array = split('\.',$dc_element);
1131    my ($type,$name);
1132
1133    if(defined $array[1])
1134    {
1135        $type = $array[0];
1136        $name = $array[1];
1137    }
1138    else
1139    {
1140        $type = "ex";
1141        $name = $array[0];
1142    }
1143   
1144    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1145    }
1146    return $all_text;
1147}
1148
1149
11501;
Note: See TracBrowser for help on using the browser.