root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 27500

Revision 27500, 32.9 KB (checked in by jmt12, 6 years ago)

Missed an old style file open that instead needs to go through FileUtils::openFileHandle()

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46       { 'name' => "group_size",
47    'desc' => "{BasPlugout.group_size}",
48    'type' => "int",
49        'deft' =>  "1",
50    'reqd' => "no",
51    'hiddengli' => "no"},
52       { 'name' => "output_info",
53    'desc' => "{BasPlugout.output_info}",
54    'type' => "string",   
55    'reqd' => "yes",
56    'hiddengli' => "yes"},       
57       { 'name' => "xslt_file",
58    'desc' => "{BasPlugout.xslt_file}",
59    'type' => "string",
60    'reqd' => "no",
61     'deft' => "",
62    'hiddengli' => "no"},
63       { 'name' => "output_handle",
64    'desc' => "{BasPlugout.output_handle}",
65    'type' => "string",
66        'deft' =>  'STDERR',
67    'reqd' => "no",
68    'hiddengli' => "yes"},
69       { 'name' => "verbosity",
70    'desc' => "{BasPlugout.verbosity}",
71    'type' => "int",
72        'deft' =>  "0",
73    'reqd' => "no", 
74        'hiddengli' => "no"},
75       { 'name' => "gzip_output",
76    'desc' => "{BasPlugout.gzip_output}",
77    'type' => "flag",
78    'reqd' => "no", 
79        'hiddengli' => "no"},
80       { 'name' => "debug",
81     'desc' => "{BasPlugout.debug}",
82     'type' => "flag",
83     'reqd' => "no",
84     'hiddengli' => "yes"},
85       { 'name' => "subdir_split_length",
86     'desc' => "{BasPlugout.subdir_split_length}",
87     'type' => "int",
88     'reqd' => "no",
89         'deft' => "8",
90     'hiddengli' => "no"},
91       { 'name' => "subdir_hash_prefix",
92     'desc' => "{BasPlugout.subdir_hash_prefix}",
93     'type' => "flag",
94     'reqd' => "no",
95         'deft' => "no",
96     'hiddengli' => "no"}
97];
98
99my $options = { 'name'     => "BasePlugout",
100        'desc'     => "{BasPlugout.desc}",
101        'abstract' => "yes",
102        'inherits' => "no",
103        'args'     => $arguments};
104
105sub new
106{
107    my $class = shift (@_);
108
109    my ($plugoutlist,$args,$hashArgOptLists) = @_;
110    push(@$plugoutlist, $class);
111
112    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
113
114    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
115    push(@{$hashArgOptLists->{"OptList"}},$options);
116
117    my $self = {};
118    $self->{'plugout_type'} = $class;
119    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
120    $self->{"info_only"} = 0;
121
122    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
123    # the args, just return the object. 
124    foreach my $strArg (@{$args})
125    {
126    if(defined $strArg && $strArg eq "-gsdlinfo")
127    {
128        $self->{"info_only"} = 1;
129        return bless $self, $class;
130    }
131    }
132   
133    delete $self->{"info_only"};
134   
135    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
136    {
137    my $classTempClass = bless $self, $class;
138    print STDERR "<BadPlugout d=$plugout_name>\n";
139    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
140    $classTempClass->print_txt_usage("");  # Use default resource bundle
141    die "\n";
142    }
143
144 
145    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
146    {
147    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
148    if (!defined $full_file_path) {
149        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
150        die "\n";
151    }
152    $self->{'xslt_file'} = $full_file_path;
153    }
154
155    $self->{'gs_count'} = 0;   
156
157    $self->{'keep_import_structure'} = 0;
158
159    return bless $self, $class;
160
161}
162
163sub print_xml_usage
164{
165    my $self = shift(@_);
166    my $header = shift(@_);
167    my $high_level_information_only = shift(@_);
168   
169    # XML output is always in UTF-8
170    gsprintf::output_strings_in_UTF8;
171
172    if ($header) {
173    &PrintUsage::print_xml_header("plugout");
174    }
175    $self->print_xml($high_level_information_only);
176}
177
178
179sub print_xml
180{
181    my $self = shift(@_);
182    my $high_level_information_only = shift(@_);
183
184    my $optionlistref = $self->{'option_list'};
185    my @optionlist = @$optionlistref;
186    my $plugoutoptions = shift(@$optionlistref);
187    return if (!defined($plugoutoptions));
188
189    gsprintf(STDERR, "<PlugoutInfo>\n");
190    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
191    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
192    $desc =~ s/</&amp;lt;/g; # doubly escaped
193    $desc =~ s/>/&amp;gt;/g;
194    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
195    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
196    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
197    unless (defined($high_level_information_only)) {
198    gsprintf(STDERR, "  <Arguments>\n");
199    if (defined($plugoutoptions->{'args'})) {
200        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
201    }
202    gsprintf(STDERR, "  </Arguments>\n");
203
204    # Recurse up the plugout hierarchy
205    $self->print_xml();
206    }
207    gsprintf(STDERR, "</PlugoutInfo>\n");
208}
209
210
211sub print_txt_usage
212{
213    my $self = shift(@_);
214
215    # Print the usage message for a plugout (recursively)
216    my $descoffset = $self->determine_description_offset(0);
217    $self->print_plugout_usage($descoffset, 1);
218}
219
220sub determine_description_offset
221{
222    my $self = shift(@_);
223    my $maxoffset = shift(@_);
224
225    my $optionlistref = $self->{'option_list'};
226    my @optionlist = @$optionlistref;
227    my $plugoutoptions = pop(@$optionlistref);
228    return $maxoffset if (!defined($plugoutoptions));
229
230    # Find the length of the longest option string of this download
231    my $plugoutargs = $plugoutoptions->{'args'};
232    if (defined($plugoutargs)) {
233    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
234    if ($longest > $maxoffset) {
235        $maxoffset = $longest;
236    }
237    }
238
239    # Recurse up the download hierarchy
240    $maxoffset = $self->determine_description_offset($maxoffset);
241    $self->{'option_list'} = \@optionlist;
242    return $maxoffset;
243}
244
245
246sub print_plugout_usage
247{
248    my $self = shift(@_);
249    my $descoffset = shift(@_);
250    my $isleafclass = shift(@_);
251
252    my $optionlistref = $self->{'option_list'};
253    my @optionlist = @$optionlistref;
254    my $plugoutoptions = shift(@$optionlistref);
255    return if (!defined($plugoutoptions));
256
257    my $plugoutname = $plugoutoptions->{'name'};
258    my $plugoutargs = $plugoutoptions->{'args'};
259    my $plugoutdesc = $plugoutoptions->{'desc'};
260
261    # Produce the usage information using the data structure above
262    if ($isleafclass) {
263    if (defined($plugoutdesc)) {
264        gsprintf(STDERR, "$plugoutdesc\n\n");
265    }
266    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
267    }
268
269    # Display the download options, if there are some
270    if (defined($plugoutargs)) {
271    # Calculate the column offset of the option descriptions
272    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
273
274    if ($isleafclass) {
275        gsprintf(STDERR, " {common.specific_options}:\n");
276    }
277    else {
278        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
279    }
280
281    # Display the download options
282    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
283    }
284
285    # Recurse up the download hierarchy
286    $self->print_plugout_usage($descoffset, 0);
287    $self->{'option_list'} = \@optionlist;
288}
289
290
291sub error
292{
293      my ($strFunctionName,$strError) = @_;
294    {
295    print "Error occoured in BasePlugout.pm\n".
296        "In Function: ".$strFunctionName."\n".
297        "Error Message: ".$strError."\n";
298    exit(-1);
299    } 
300}
301
302# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
303sub set_OIDtype {
304    my $self = shift (@_);
305    my ($type, $metadata) = @_;
306
307    if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
308    $self->{'OIDtype'} = $type;
309    } else {
310    $self->{'OIDtype'} = "hash";
311    }
312    if ($type =~ /^assigned$/) {
313    if (defined $metadata) {
314        $self->{'OIDmetadata'} = $metadata;
315    } else {
316        $self->{'OIDmetadata'} = "dc.Identifier";
317    }
318    }
319}
320
321sub set_output_dir
322{
323    my $self = shift @_;
324    my ($output_dir) = @_;
325
326   $self->{'output_dir'} = $output_dir;
327}
328
329sub setoutputdir
330{
331    my $self = shift @_;
332    my ($output_dir) = @_;
333
334    $self->{'output_dir'} = $output_dir;
335}
336
337sub get_output_dir
338{
339    my $self = shift (@_);
340
341   return $self->{'output_dir'};
342}
343
344sub getoutputdir
345{
346    my $self = shift (@_);
347
348    return $self->{'output_dir'};
349}
350
351sub getoutputinfo
352{
353    my $self = shift (@_);
354
355    return $self->{'output_info'};
356}
357
358
359sub get_output_handler
360{
361    my $self = shift (@_);
362
363    my ($output_file_name) = @_;
364
365    my $fh;
366    &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
367
368    return $fh;
369}
370
371sub release_output_handler
372{
373    my $self = shift (@_);
374    my ($outhandler) = @_;
375
376    close($outhandler);
377
378}
379
380sub output_xml_header {
381    my $self = shift (@_);
382    my ($handle,$docroot,$nondoctype) = @_;
383   
384   
385    #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
386   
387    #For Dspace must be UTF in lower case
388    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
389   
390    if (!defined $nondoctype){
391    my $doctype = (defined $docroot) ? $docroot : "Section";
392
393    # Used to be '<!DOCTYPE Archive SYSTEM ...'
394   
395    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
396    }
397
398    print $handle "<$docroot>\n" if defined $docroot;
399}
400
401sub output_xml_footer {
402    my $self = shift (@_);
403    my ($handle,$docroot) = @_;
404    print $handle "</$docroot>\n" if defined $docroot;
405}
406
407
408sub output_general_xml_header
409{
410    my $self = shift (@_);
411    my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
412   
413    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
414   
415    if (defined $opt_dtd) {
416        my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
417    print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
418    }
419
420    if (defined $docroot) {
421        my $full_docroot = $docroot;
422        if (defined $opt_attributes) {
423          $full_docroot .= " $opt_attributes";
424        }
425
426        print $handle "<$full_docroot>\n"
427      }
428}
429
430sub output_general_xml_footer
431{
432    output_xml_footer(@_);
433}
434
435
436sub process {
437    my $self = shift (@_);
438    my ($doc_obj) = @_;
439   
440    # for OAI purposes
441    $doc_obj->set_lastmodified();
442    $doc_obj->set_oailastmodified();
443
444     if ($self->{'group_size'} > 1) {
445     $self->group_process ($doc_obj);
446    return;
447    }
448
449    my $OID = $doc_obj->get_OID();
450    $OID = "NULL" unless defined $OID;     
451
452    my $top_section = $doc_obj->get_top_section();
453
454    #get document's directory
455    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
456   
457    my $output_info = $self->{'output_info'};
458    return if (!defined $output_info);
459     
460    ##############################
461    # call subclass' saveas method
462    ##############################
463    $self->saveas($doc_obj,$doc_dir);
464    $self->archiveinf_db($doc_obj,$doc_dir);
465
466}
467
468sub store_output_info_reference {
469    my $self = shift (@_);
470    my ($doc_obj) = @_;
471
472    my $output_info = $self->{'output_info'};
473    my $metaname = $self->{'sortmeta'};
474    if (!defined $metaname || $metaname !~ /\S/) {
475    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
476    return;
477    }
478   
479    my $metadata = "";
480    my $top_section = $doc_obj->get_top_section();
481   
482    my @commameta_list = split(/,/, $metaname);
483    foreach my $cmn (@commameta_list) {
484    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
485    if ($meta) {
486        # do remove prefix/suffix - this will apply to all values
487        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
488        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
489        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
490        $metadata .= $meta if ($meta);
491    }
492    }
493
494    # store reference in the output_info     
495    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
496   
497}
498
499sub group_process {
500
501    my $self = shift (@_);
502    my ($doc_obj) = @_;
503   
504    my $OID = $doc_obj->get_OID();
505    $OID = "NULL" unless defined $OID;
506
507    my $groupsize = $self->{'group_size'};
508    my $gs_count = $self->{'gs_count'};
509    my $open_new_file = (($gs_count % $groupsize)==0);
510    my $outhandle = $self->{'output_handle'};
511
512    # opening a new file, or document has assoicated files => directory needed
513    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
514         
515        # The directory the archive file (doc.xml) and all associated files
516        # should end up in
517        my $doc_dir;
518        # If we've determined its time for a new file, open it now
519        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
520          {
521            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
522            # only if opening new file
523        my $output_dir = $self->get_output_dir();
524        &util::mk_all_dir ($output_dir) unless -e $output_dir;
525        my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
526        my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
527       
528        if ($gs_count>0)
529        {
530        return if (!$self->close_file_output());
531        }
532
533        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
534           
535
536        $self->{'gs_filename'} = $doc_file;
537        $self->{'short_doc_file'} = $short_doc_file;
538        $self->{'gs_OID'} = $OID;
539            $self->{'gs_doc_dir'} = $doc_dir;
540
541        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
542    }
543        # Otherwise load the same archive document directory used last time
544        else
545          {
546            $doc_dir = $self->{'gs_doc_dir'};
547          }
548
549    # copy all the associated files, add this information as metadata
550    # to the document
551        print $outhandle "Writing associated files to $doc_dir\n";
552    $self->process_assoc_files ($doc_obj, $doc_dir);
553
554    # look up 'gsdlmetafile' metadata and store that information
555    # explicitly in $doc_obj
556    $self->process_metafiles_metadata ($doc_obj);
557    }
558
559    # save this document
560    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
561    print GROUPPROCESS $section_text;
562
563    $self->{'gs_count'}++;
564}
565
566
567sub saveas {
568    my $self = shift (@_);
569   
570    die "Basplug::saveas function must be implemented in sub classes\n";
571}
572
573sub get_doc_dir {
574    my $self = shift (@_);
575    my ($OID, $source_filename) = @_;
576
577    my $working_dir  = $self->get_output_dir();
578    my $working_info = $self->{'output_info'};
579    return if (!defined $working_info);
580
581    my $doc_info = $working_info->get_info($OID);
582    my $doc_dir = '';
583
584    if (defined $doc_info && scalar(@$doc_info) >= 1)
585    {
586    # This OID already has an archives directory, so use it again
587    $doc_dir = $doc_info->[0];
588    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
589    }
590    elsif ($self->{'keep_import_structure'})
591    {
592    $source_filename = &File::Basename::dirname($source_filename);
593    $source_filename =~ s/[\\\/]+/\//g;
594    $source_filename =~ s/\/$//;
595
596        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
597    }
598
599    # We have to use a new archives directory for this document
600    if ($doc_dir eq "")
601    {
602    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
603    }
604
605    if (!defined $self->{'group'} || !$self->{'group'}){
606    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
607    }
608
609    return $doc_dir;
610}
611
612sub get_new_doc_dir{
613   my $self = shift (@_); 
614   my($working_info,$working_dir,$OID) = @_;     
615   
616   
617   my $doc_dir = "";
618   my $doc_dir_rest = $OID;
619
620   # remove any \ and / from the OID
621   $doc_dir_rest =~ s/[\\\/]//g;
622
623   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
624   $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
625
626   my $doc_dir_num = 0;
627
628   do {
629       $doc_dir .= "/" if $doc_dir_num > 0;
630       my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
631       if ($self->{'subdir_hash_prefix'})
632       {
633         $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
634       }
635       #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
636       if ($doc_dir_rest =~ s/$pattern//i)
637       {
638       $doc_dir .= $1;
639       $doc_dir_num++;
640       }
641   } while ($doc_dir_rest ne "" &&
642        ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
643         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
644   my $i = 1;
645   my $doc_dir_base = $doc_dir;
646   while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
647       $doc_dir = "$doc_dir_base-$i";
648       $i++;
649   }
650         
651   return "$doc_dir.dir";
652}
653
654sub process_assoc_files {
655    my $self = shift (@_);
656    my ($doc_obj, $doc_dir, $handle) = @_;
657
658    my $outhandle = $self->{'output_handle'};
659   
660    my $output_dir = $self->get_output_dir();
661    return if (!defined $output_dir);
662
663    &util::mk_all_dir ($output_dir) unless -e $output_dir;
664     
665    my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
666    &util::mk_all_dir ($working_dir) unless -e $working_dir;
667
668    my @assoc_files = ();
669    my $filename;;
670
671    my $source_filename = $doc_obj->get_source_filename();
672
673    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
674
675    if (defined $collect_dir) {
676    my $dirsep_regexp = &util::get_os_dirsep();
677
678    if ($collect_dir !~ /$dirsep_regexp$/) {
679        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
680    }
681
682    # This test is never going to fail on Windows -- is this a problem?
683     
684    if ($source_filename !~ /^$dirsep_regexp/) {
685        $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
686    }
687    }
688
689
690    # set the assocfile path (even if we have no assoc files - need this for lucene)
691    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
692                     "assocfilepath",
693                     "$doc_dir");
694    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
695    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
696    $dir = "" unless defined $dir;
697       
698    my $utf8_real_filename = $assoc_file_rec->[0];
699
700    # for some reasons the image associate file has / before the full path
701    $utf8_real_filename =~ s/^\\(.*)/$1/i;
702
703##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
704    my $real_filename = $utf8_real_filename;
705    $real_filename = &util::downgrade_if_dos_filename($real_filename);
706
707    if (-e $real_filename) {
708
709        $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
710
711            &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
712
713        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
714                     "gsdlassocfile",
715                     "$afile:$assoc_file_rec->[2]:$dir");
716    } elsif ($self->{'verbosity'} > 1) {
717        print $outhandle "BasePlugout::process couldn't copy the associated file " .
718        "$real_filename to $afile\n";
719    }
720    }
721}
722
723
724sub process_metafiles_metadata
725{
726    my $self = shift (@_);
727    my ($doc_obj) = @_;
728
729    my $top_section = $doc_obj->get_top_section();
730    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
731
732    foreach my $metafile_pair (@$metafiles) {
733    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
734
735    $doc_obj->metadata_file($full_metafile,$metafile);
736    }
737
738    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
739}
740
741sub archiveinf_files_to_field
742{
743    my $self = shift(@_);
744    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
745
746    foreach my $file_rec (@$files) {
747    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
748    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
749    # for some reasons the image associate file has / before the full path
750    $real_filename =~ s/^\\(.*)/$1/i;
751
752    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
753
754    if (-e $raw_filename) {
755
756#       if (defined $collect_dir) {
757#       my $collect_dir_re_safe = $collect_dir;
758#       $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
759#       $collect_dir_re_safe =~ s/\./\\./g;##
760
761#       $real_filename =~ s/^$collect_dir_re_safe//;
762#       }
763       
764        if (defined $reverse_lookups) {
765        $reverse_lookups->{$real_filename} = 1;
766        }
767###     push(@{$oid_files->{$field}},$full_file);
768        push(@{$oid_files->{$field}},$raw_filename);
769    }
770    else {
771        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
772    }
773    }
774}
775
776sub archiveinf_db
777{
778    my $self = shift (@_);
779    my ($doc_obj) = @_;
780
781    my $verbosity = $self->{'verbosity'};
782
783    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
784    if (defined $collect_dir) {
785    my $dirsep_regexp = &util::get_os_dirsep();
786
787    if ($collect_dir !~ /$dirsep_regexp$/) {
788        # ensure there is a slash at the end
789        $collect_dir .= &util::get_dirsep();
790    }
791    }
792
793    my $oid = $doc_obj->get_OID();
794    my $source_filename = $doc_obj->get_unmodified_source_filename();
795    my $working_info = $self->{'output_info'};
796    my $doc_info = $working_info->get_info($oid);
797
798    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
799    # doc_file is the path to the archive doc.xml. Make sure it has unix
800    # slashes, then if the collection is copied to linux, it can be built without reimport
801    $doc_file =~ s/\\/\//g;
802    my $oid_files = { 'doc-file' => $doc_file,
803              'index-status' => $index_status,
804              'src-file' => $source_filename,
805              'sort-meta' => $sortmeta,
806              'assoc-file' => [],
807              'meta-file'  => [] };
808   
809    my $reverse_lookups = { $source_filename => "1" };
810
811
812    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
813                     $collect_dir,$oid_files,$reverse_lookups);
814
815
816    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
817                     $collect_dir,$oid_files);
818
819    # Get the infodbtype value for this collection from the arcinfo object
820    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
821    my $output_dir = $self->{'output_dir'};
822
823    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
824    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
825
826    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
827
828    if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
829    my $top_section = $doc_obj->get_top_section();
830
831    my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
832    my $dc_title = join("; ", @$dc_titles);
833
834    if ($oid_files->{'index-status'} eq "R") {
835        $dc_title .= " (Updated)";
836    }
837
838    my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
839    if (open(RSSOUT,">>$rss_filename")) {
840        print RSSOUT "<item>\n";
841        print RSSOUT "   <title>$dc_title</title>\n";
842        print RSSOUT "   <link>_httpdomain__httpcollection_/document/$oid</link>\n";
843        print RSSOUT "</item>\n";
844        close(RSSOUT);
845    }
846    else {
847        print STDERR "**** Failed to open $rss_filename\n!$\n";
848    }
849
850
851    }
852
853    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
854    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
855    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
856    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
857
858    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
859    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
860    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
861
862    foreach my $rl (keys %$reverse_lookups) {
863    $working_info->add_reverseinfo($rl,$oid);
864    } 
865
866    # meta files not set in reverese entry, but need to set the metadata flag
867    if (defined $doc_obj->get_meta_files()) {
868    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
869        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
870        $working_info->set_meta_file_flag($full_file);
871    }
872    }
873}
874
875
876sub set_sortmeta {
877    my $self = shift (@_);
878    my ($sortmeta, $removeprefix, $removesuffix) = @_;
879   
880    $self->{'sortmeta'} = $sortmeta;
881    if (defined ($removeprefix) && $removeprefix ) {
882    $removeprefix =~ s/^\^//; # don't need a leading ^
883    $self->{'removeprefix'} = $removeprefix;
884    }
885    if (defined ($removesuffix) && $removesuffix) {
886    $removesuffix =~ s/\$$//; # don't need a trailing $
887    $self->{'removesuffix'} = $removesuffix;
888    }
889}
890
891sub open_xslt_pipe
892{
893    my $self = shift @_;
894    my ($output_file_name, $xslt_file)=@_;
895
896    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
897   
898    my $java_class_path =  &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
899
900    my $mapping_file_path = "";
901
902    if ($ENV{'GSDLOS'} eq "windows"){
903    $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
904    # this file:/// bit didn't work for me on windows XP
905    #$xslt_file = "\"file:///".$xslt_file."\"";
906    #$mapping_file_path = "\"file:///";
907    }
908    else{
909    $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
910    }
911
912
913    $java_class_path = "\"".$java_class_path."\"";
914
915    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
916
917    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
918    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
919    $cmd .= "-m $mapping_file_path";
920    }
921   
922    open(*XMLWRITER, $cmd)
923    or die "can't open pipe to xslt: $!";
924
925   
926    $self->{'xslt_writer'} = *XMLWRITER;
927
928    print XMLWRITER "<?DocStart?>\n";       
929    print XMLWRITER "$output_file_name\n";
930
931 
932  }
933 
934
935sub close_xslt_pipe
936{
937  my $self = shift @_;
938
939 
940  return unless defined $self->{'xslt_writer'} ;
941   
942  my $xsltwriter = $self->{'xslt_writer'};
943 
944  print $xsltwriter "<?DocEnd?>\n";
945  close($xsltwriter);
946
947  undef $self->{'xslt_writer'};
948
949}
950
951sub close_file_output
952{
953    my ($self) = @_;
954 
955    # make sure that the handle has been opened - it won't be if we failed
956    # to import any documents...
957    if (defined(fileno(GROUPPROCESS))) {
958    $self->output_xml_footer('GROUPPROCESS','Archive');   
959    close GROUPPROCESS;
960    }
961
962    my $OID = $self->{'gs_OID'};
963    my $short_doc_file = $self->{'short_doc_file'};
964   
965    if ($self->{'gzip'}) {
966    my $doc_file = $self->{'gs_filename'};
967    `gzip $doc_file`;
968    $doc_file .= ".gz";
969    $short_doc_file .= ".gz";
970    if (!-e $doc_file) {
971         my $outhandle = $self->{'output_handle'};
972        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
973        return 0;
974    }
975    }
976
977    # store reference in output_info
978    my $output_info = $self->{'output_info'};
979    return 0 if (!defined $output_info);
980    $output_info->add_info($OID, $short_doc_file, undef, undef);
981    return 1;
982}
983
984
985#the subclass should implement this method if is_group method could return 1.
986sub close_group_output{
987   my $self = shift (@_);       
988}
989
990sub is_group {
991    my $self = shift (@_);
992    return 0;       
993}
994
995my $dc_set = { Title => 1,       
996           Creator => 1,
997           Subject => 1,
998           Description => 1,
999           Publisher => 1,
1000           Contributor => 1,
1001           Date => 1,
1002           Type => 1,
1003           Format => 1,
1004           Identifier => 1,
1005           Source => 1,
1006           Language => 1,
1007           Relation => 1,
1008           Coverage => 1,
1009           Rights => 1};
1010
1011
1012# returns an XML representation of the dublin core metadata
1013# if dc meta is not found, try ex meta
1014# This method is not used by the DSpacePlugout, which has its
1015# own method to save its dc metadata
1016sub get_dc_metadata {
1017    my $self = shift(@_);
1018    my ($doc_obj, $section, $version) = @_;
1019   
1020    # build up string of dublin core metadata
1021    $section="" unless defined $section;
1022   
1023    my $section_ptr = $doc_obj->_lookup_section($section);
1024    return "" unless defined $section_ptr;
1025
1026
1027    my $explicit_dc = {};
1028    my $explicit_ex_dc = {};
1029    my $explicit_ex = {};
1030
1031    my $all_text="";
1032   
1033    # We want high quality dc metadata to go in first, so we store all the
1034    # assigned dc.* values first. Then, for all those dc metadata names in
1035    # the official dc set that are as yet unassigned, we look to see whether
1036    # embedded ex.dc.* metadata has defined some values for them. If not,
1037    # then for the same missing dc metadata names, we look in ex metadata.
1038
1039    foreach my $data (@{$section_ptr->{'metadata'}}){
1040    my $escaped_value = &docprint::escape_text($data->[1]);
1041    if ($data->[0]=~ m/^dc\./) {
1042        $data->[0] =~ tr/[A-Z]/[a-z]/;
1043
1044        $data->[0] =~ m/^dc\.(.*)/;
1045        my $dc_element =  $1;
1046
1047        if (!defined $explicit_dc->{$dc_element}) {
1048        $explicit_dc->{$dc_element} = [];
1049        }
1050        push(@{$explicit_dc->{$dc_element}},$escaped_value);
1051
1052        if (defined $version && ($version eq "oai_dc")) {
1053        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1054        }
1055        else {
1056        # qualifier???
1057        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1058        }
1059
1060    } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1061        $data->[0] =~ m/^ex\.dc\.(.*)/;
1062        my $ex_dc_element = $1;
1063        my $lc_ex_dc_element = lc($ex_dc_element);
1064
1065        # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1066        if (defined $dc_set->{$ex_dc_element}) {
1067        if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1068            $explicit_ex_dc->{$lc_ex_dc_element} = [];
1069        }
1070        push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1071        }
1072    }
1073    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1074        $data->[0] =~ m/^(ex\.)?(.*)/;
1075        my $ex_element = $2;
1076        my $lc_ex_element = lc($ex_element);
1077
1078        if (defined $dc_set->{$ex_element}) {
1079        if (!defined $explicit_ex->{$lc_ex_element}) {
1080            $explicit_ex->{$lc_ex_element} = [];
1081        }
1082        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1083        }
1084    }
1085    }
1086
1087    # go through dc_set and for any element *not* defined in explicit_dc
1088    # that does exist in explicit_ex, add it in as metadata
1089    foreach my $k ( keys %$dc_set ) {
1090    my $lc_k = lc($k);
1091
1092    if (!defined $explicit_dc->{$lc_k}) {
1093        # try to find if ex.dc.* defines this dc.* meta,
1094        # if not, then look for whether there's an ex.* equivalent
1095
1096        if (defined $explicit_ex_dc->{$lc_k}) {
1097        foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1098            my $dc_element    = $lc_k;
1099            my $escaped_value = $v;
1100           
1101            if (defined $version && ($version eq "oai_dc")) {
1102            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1103            }
1104            else {
1105            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1106            }           
1107        }
1108        } elsif (defined $explicit_ex->{$lc_k}) {
1109        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1110            my $dc_element    = $lc_k;
1111            my $escaped_value = $v;
1112
1113            if (defined $version && ($version eq "oai_dc")) {
1114            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1115            }
1116            else {
1117            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1118            }
1119        }
1120        }
1121    }
1122    }
1123
1124    if ($all_text eq "") {
1125    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1126    }   
1127    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1128
1129    return $all_text;
1130}
1131
1132# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1133# This method was apparently added by Jeffrey and committed by Shaoqun.
1134# But we don't know why it was added, so not using it anymore.
1135sub new_get_dc_metadata {
1136   
1137    my $self = shift(@_);
1138    my ($doc_obj, $section, $version) = @_;
1139
1140    # build up string of dublin core metadata
1141    $section="" unless defined $section;
1142   
1143    my $section_ptr=$doc_obj->_lookup_section($section);
1144    return "" unless defined $section_ptr;
1145
1146    my $all_text = "";
1147    foreach my $data (@{$section_ptr->{'metadata'}}){
1148    my $escaped_value = &docprint::escape_text($data->[1]);
1149    my $dc_element =  $data->[0];
1150   
1151    my @array = split('\.',$dc_element);
1152    my ($type,$name);
1153
1154    if(defined $array[1])
1155    {
1156        $type = $array[0];
1157        $name = $array[1];
1158    }
1159    else
1160    {
1161        $type = "ex";
1162        $name = $array[0];
1163    }
1164   
1165    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1166    }
1167    return $all_text;
1168}
1169
1170
11711;
Note: See TracBrowser for help on using the browser.