root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 24404

Revision 24404, 31.2 KB (checked in by ak19, 9 years ago)

Changes to perl code to do with removing the ex. prefix: ex. is only removed if it is the sole prefix (i.e. ex.dc.* prefixes are not removed).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60     'deft' => "",
61    'hiddengli' => "no"},
62       { 'name' => "output_handle",
63    'desc' => "{BasPlugout.output_handle}",
64    'type' => "string",
65        'deft' =>  'STDERR',
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68       { 'name' => "verbosity",
69    'desc' => "{BasPlugout.verbosity}",
70    'type' => "int",
71        'deft' =>  "0",
72    'reqd' => "no", 
73        'hiddengli' => "no"},
74       { 'name' => "gzip_output",
75    'desc' => "{BasPlugout.gzip_output}",
76    'type' => "flag",
77    'reqd' => "no", 
78        'hiddengli' => "no"},
79       { 'name' => "debug",
80     'desc' => "{BasPlugout.debug}",
81     'type' => "flag",
82     'reqd' => "no",
83     'hiddengli' => "yes"}
84];
85
86my $options = { 'name'     => "BasePlugout",
87        'desc'     => "{BasPlugout.desc}",
88        'abstract' => "yes",
89        'inherits' => "no",
90        'args'     => $arguments};
91
92sub new
93{
94    my $class = shift (@_);
95
96    my ($plugoutlist,$args,$hashArgOptLists) = @_;
97    push(@$plugoutlist, $class);
98
99    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102    push(@{$hashArgOptLists->{"OptList"}},$options);
103
104    my $self = {};
105    $self->{'plugout_type'} = $class;
106    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107    $self->{"info_only"} = 0;
108
109    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110    # the args, just return the object. 
111    foreach my $strArg (@{$args})
112    {
113    if(defined $strArg && $strArg eq "-gsdlinfo")
114    {
115        $self->{"info_only"} = 1;
116        return bless $self, $class;
117    }
118    }
119   
120    delete $self->{"info_only"};
121   
122    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123    {
124    my $classTempClass = bless $self, $class;
125    print STDERR "<BadPlugout d=$plugout_name>\n";
126    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127    $classTempClass->print_txt_usage("");  # Use default resource bundle
128    die "\n";
129    }
130
131 
132    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
133    {
134    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135    if (!defined $full_file_path) {
136        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137        die "\n";
138    }
139    $self->{'xslt_file'} = $full_file_path;
140    }
141
142    $self->{'gs_count'} = 0;   
143
144    $self->{'keep_import_structure'} = 0;
145
146    return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152    my $self = shift(@_);
153    my $header = shift(@_);
154    my $high_level_information_only = shift(@_);
155   
156    # XML output is always in UTF-8
157    gsprintf::output_strings_in_UTF8;
158
159    if ($header) {
160    &PrintUsage::print_xml_header("plugout");
161    }
162    $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168    my $self = shift(@_);
169    my $high_level_information_only = shift(@_);
170
171    my $optionlistref = $self->{'option_list'};
172    my @optionlist = @$optionlistref;
173    my $plugoutoptions = shift(@$optionlistref);
174    return if (!defined($plugoutoptions));
175
176    gsprintf(STDERR, "<PlugoutInfo>\n");
177    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
178    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179    $desc =~ s/</&amp;lt;/g; # doubly escaped
180    $desc =~ s/>/&amp;gt;/g;
181    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
182    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184    unless (defined($high_level_information_only)) {
185    gsprintf(STDERR, "  <Arguments>\n");
186    if (defined($plugoutoptions->{'args'})) {
187        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188    }
189    gsprintf(STDERR, "  </Arguments>\n");
190
191    # Recurse up the plugout hierarchy
192    $self->print_xml();
193    }
194    gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200    my $self = shift(@_);
201
202    # Print the usage message for a plugout (recursively)
203    my $descoffset = $self->determine_description_offset(0);
204    $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209    my $self = shift(@_);
210    my $maxoffset = shift(@_);
211
212    my $optionlistref = $self->{'option_list'};
213    my @optionlist = @$optionlistref;
214    my $plugoutoptions = pop(@$optionlistref);
215    return $maxoffset if (!defined($plugoutoptions));
216
217    # Find the length of the longest option string of this download
218    my $plugoutargs = $plugoutoptions->{'args'};
219    if (defined($plugoutargs)) {
220    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221    if ($longest > $maxoffset) {
222        $maxoffset = $longest;
223    }
224    }
225
226    # Recurse up the download hierarchy
227    $maxoffset = $self->determine_description_offset($maxoffset);
228    $self->{'option_list'} = \@optionlist;
229    return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235    my $self = shift(@_);
236    my $descoffset = shift(@_);
237    my $isleafclass = shift(@_);
238
239    my $optionlistref = $self->{'option_list'};
240    my @optionlist = @$optionlistref;
241    my $plugoutoptions = shift(@$optionlistref);
242    return if (!defined($plugoutoptions));
243
244    my $plugoutname = $plugoutoptions->{'name'};
245    my $plugoutargs = $plugoutoptions->{'args'};
246    my $plugoutdesc = $plugoutoptions->{'desc'};
247
248    # Produce the usage information using the data structure above
249    if ($isleafclass) {
250    if (defined($plugoutdesc)) {
251        gsprintf(STDERR, "$plugoutdesc\n\n");
252    }
253    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254    }
255
256    # Display the download options, if there are some
257    if (defined($plugoutargs)) {
258    # Calculate the column offset of the option descriptions
259    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
260
261    if ($isleafclass) {
262        gsprintf(STDERR, " {common.specific_options}:\n");
263    }
264    else {
265        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266    }
267
268    # Display the download options
269    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270    }
271
272    # Recurse up the download hierarchy
273    $self->print_plugout_usage($descoffset, 0);
274    $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280      my ($strFunctionName,$strError) = @_;
281    {
282    print "Error occoured in BasePlugout.pm\n".
283        "In Function: ".$strFunctionName."\n".
284        "Error Message: ".$strError."\n";
285    exit(-1);
286    } 
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291    my $self = shift (@_);
292    my ($type, $metadata) = @_;
293
294    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295    $self->{'OIDtype'} = $type;
296    } else {
297    $self->{'OIDtype'} = "hash";
298    }
299    if ($type =~ /^assigned$/) {
300    if (defined $metadata) {
301        $self->{'OIDmetadata'} = $metadata;
302    } else {
303        $self->{'OIDmetadata'} = "dc.Identifier";
304    }
305    }
306}
307
308sub set_output_dir
309{
310    my $self = shift @_;
311    my ($output_dir) = @_;
312
313   $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318    my $self = shift @_;
319    my ($output_dir) = @_;
320
321    $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326    my $self = shift (@_);
327
328   return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333    my $self = shift (@_);
334
335    return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340    my $self = shift (@_);
341
342    return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348    my $self = shift (@_);
349
350    my ($output_file_name) = @_;
351     
352    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
353
354    return *OUTPUT;           
355}
356
357sub release_output_handler
358{
359    my $self = shift (@_);
360    my ($outhandler) = @_;
361
362    close($outhandler);
363
364}
365
366sub output_xml_header {
367    my $self = shift (@_);
368    my ($handle,$docroot,$nondoctype) = @_;
369   
370   
371    #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372   
373    #For Dspace must be UTF in lower case
374    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
375   
376    if (!defined $nondoctype){
377    my $doctype = (defined $docroot) ? $docroot : "Section";
378
379    # Used to be '<!DOCTYPE Archive SYSTEM ...'
380   
381    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
382    }
383
384    print $handle "<$docroot>\n" if defined $docroot;
385}
386
387sub output_xml_footer {
388    my $self = shift (@_);
389    my ($handle,$docroot) = @_;
390    print $handle "</$docroot>\n" if defined $docroot;
391}
392
393
394sub output_general_xml_header
395{
396    my $self = shift (@_);
397    my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
398   
399    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
400   
401    if (defined $opt_dtd) {
402        my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
403    print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
404    }
405
406    if (defined $docroot) {
407        my $full_docroot = $docroot;
408        if (defined $opt_attributes) {
409          $full_docroot .= " $opt_attributes";
410        }
411
412        print $handle "<$full_docroot>\n"
413      }
414}
415
416sub output_general_xml_footer
417{
418    output_xml_footer(@_);
419}
420
421
422sub process {
423    my $self = shift (@_);
424    my ($doc_obj) = @_;
425   
426    # for OAI purposes
427    $doc_obj->set_lastmodified();
428    $doc_obj->set_oailastmodified();
429
430     if ($self->{'group_size'} > 1) {
431     $self->group_process ($doc_obj);
432    return;
433    }
434
435    my $OID = $doc_obj->get_OID();
436    $OID = "NULL" unless defined $OID;     
437
438    my $top_section = $doc_obj->get_top_section();
439
440    #get document's directory
441    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
442   
443    my $output_info = $self->{'output_info'};
444    return if (!defined $output_info);
445     
446    ##############################
447    # call subclass' saveas method
448    ##############################
449    $self->saveas($doc_obj,$doc_dir);
450    $self->archiveinf_db($doc_obj,$doc_dir);
451
452}
453
454sub store_output_info_reference {
455    my $self = shift (@_);
456    my ($doc_obj) = @_;
457
458    my $output_info = $self->{'output_info'};
459    my $metaname = $self->{'sortmeta'};
460    if (!defined $metaname || $metaname !~ /\S/) {
461    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
462    return;
463    }
464   
465    my $metadata = "";
466    my $top_section = $doc_obj->get_top_section();
467   
468    my @commameta_list = split(/,/, $metaname);
469    foreach my $cmn (@commameta_list) {
470    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
471    if ($meta) {
472        # do remove prefix/suffix - this will apply to all values
473        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
474        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
475        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
476        $metadata .= $meta if ($meta);
477    }
478    }
479
480    # store reference in the output_info     
481    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
482   
483}
484
485sub group_process {
486
487    my $self = shift (@_);
488    my ($doc_obj) = @_;
489   
490    my $OID = $doc_obj->get_OID();
491    $OID = "NULL" unless defined $OID;
492
493    my $groupsize = $self->{'group_size'};
494    my $gs_count = $self->{'gs_count'};
495    my $open_new_file = (($gs_count % $groupsize)==0);
496    my $outhandle = $self->{'output_handle'};
497
498    # opening a new file, or document has assoicated files => directory needed
499    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
500         
501        # The directory the archive file (doc.xml) and all associated files
502        # should end up in
503        my $doc_dir;
504        # If we've determined its time for a new file, open it now
505        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
506          {
507            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
508            # only if opening new file
509        my $output_dir = $self->get_output_dir();
510        &util::mk_all_dir ($output_dir) unless -e $output_dir;
511        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
512        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
513       
514        if ($gs_count>0)
515        {
516        return if (!$self->close_file_output());
517        }
518
519        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
520           
521
522        $self->{'gs_filename'} = $doc_file;
523        $self->{'short_doc_file'} = $short_doc_file;
524        $self->{'gs_OID'} = $OID;
525            $self->{'gs_doc_dir'} = $doc_dir;
526
527        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
528    }
529        # Otherwise load the same archive document directory used last time
530        else
531          {
532            $doc_dir = $self->{'gs_doc_dir'};
533          }
534
535    # copy all the associated files, add this information as metadata
536    # to the document
537        print $outhandle "Writing associated files to $doc_dir\n";
538    $self->process_assoc_files ($doc_obj, $doc_dir);
539
540    # look up 'gsdlmetafile' metadata and store that information
541    # explicitly in $doc_obj
542    $self->process_metafiles_metadata ($doc_obj);
543    }
544
545    # save this document
546    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
547    print GROUPPROCESS $section_text;
548
549    $self->{'gs_count'}++;
550}
551
552
553sub saveas {
554    my $self = shift (@_);
555   
556    die "Basplug::saveas function must be implemented in sub classes\n";
557}
558
559sub get_doc_dir {
560    my $self = shift (@_);
561    my ($OID, $source_filename) = @_;
562
563    my $working_dir  = $self->get_output_dir();
564    my $working_info = $self->{'output_info'};
565    return if (!defined $working_info);
566
567    my $doc_info = $working_info->get_info($OID);
568    my $doc_dir = '';
569
570    if (defined $doc_info && scalar(@$doc_info) >= 1)
571    {
572    # This OID already has an archives directory, so use it again
573    $doc_dir = $doc_info->[0];
574    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
575    }
576    elsif ($self->{'keep_import_structure'})
577    {
578    $source_filename = &File::Basename::dirname($source_filename);
579    $source_filename =~ s/[\\\/]+/\//g;
580    $source_filename =~ s/\/$//;
581
582        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
583    }
584
585    # We have to use a new archives directory for this document
586    if ($doc_dir eq "")
587    {
588    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
589    }
590
591    if (!defined $self->{'group'} || !$self->{'group'}){
592    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
593    }
594
595    return $doc_dir;
596}
597
598sub get_new_doc_dir{
599   my $self = shift (@_); 
600   my($working_info,$working_dir,$OID) = @_;     
601   
602   
603   my $doc_dir = "";
604   my $doc_dir_rest = $OID;
605
606   # remove any \ and / from the OID
607   $doc_dir_rest =~ s/[\\\/]//g;
608
609   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
610   $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
611
612   my $doc_dir_num = 0;
613
614   do {
615       $doc_dir .= "/" if $doc_dir_num > 0;
616       if ($doc_dir_rest =~ s/^(.{1,8})//) {
617       $doc_dir .= $1;
618       $doc_dir_num++;
619       }
620   } while ($doc_dir_rest ne "" &&
621        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
622         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
623   my $i = 1;
624   my $doc_dir_base = $doc_dir;
625   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
626       $doc_dir = "$doc_dir_base-$i";
627       $i++;
628   }
629         
630   return "$doc_dir.dir";
631}
632
633sub process_assoc_files {
634    my $self = shift (@_);
635    my ($doc_obj, $doc_dir, $handle) = @_;
636
637    my $outhandle = $self->{'output_handle'};
638   
639    my $output_dir = $self->get_output_dir();
640    return if (!defined $output_dir);
641
642    &util::mk_all_dir ($output_dir) unless -e $output_dir;
643     
644    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
645    &util::mk_all_dir ($working_dir) unless -e $working_dir;
646
647    my @assoc_files = ();
648    my $filename;;
649
650    my $source_filename = $doc_obj->get_source_filename();
651
652    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
653
654    if (defined $collect_dir) {
655    my $dirsep_regexp = &util::get_os_dirsep();
656
657    if ($collect_dir !~ /$dirsep_regexp$/) {
658        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
659    }
660
661    # This test is never going to fail on Windows -- is this a problem?
662     
663    if ($source_filename !~ /^$dirsep_regexp/) {
664        $source_filename = &util::filename_cat($collect_dir, $source_filename);
665    }
666    }
667
668
669    # set the assocfile path (even if we have no assoc files - need this for lucene)
670    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
671                     "assocfilepath",
672                     "$doc_dir");
673    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
674    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
675    $dir = "" unless defined $dir;
676       
677    my $utf8_real_filename = $assoc_file_rec->[0];
678
679    # for some reasons the image associate file has / before the full path
680    $utf8_real_filename =~ s/^\\(.*)/$1/i;
681
682##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
683    my $real_filename = $utf8_real_filename;
684    $real_filename = &util::downgrade_if_dos_filename($real_filename);
685
686    if (-e $real_filename) {
687
688        $filename = &util::filename_cat($working_dir, $afile);
689
690        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
691       
692        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
693                     "gsdlassocfile",
694                     "$afile:$assoc_file_rec->[2]:$dir");
695    } elsif ($self->{'verbosity'} > 1) {
696        print $outhandle "BasePlugout::process couldn't copy the associated file " .
697        "$real_filename to $afile\n";
698    }
699    }
700}
701
702
703sub process_metafiles_metadata
704{
705    my $self = shift (@_);
706    my ($doc_obj) = @_;
707
708    my $top_section = $doc_obj->get_top_section();
709    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
710
711    foreach my $metafile_pair (@$metafiles) {
712    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
713
714    $doc_obj->metadata_file($full_metafile,$metafile);
715    }
716
717    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
718}
719
720sub archiveinf_files_to_field
721{
722    my $self = shift(@_);
723    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
724
725    foreach my $file_rec (@$files) {
726    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
727    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
728    # for some reasons the image associate file has / before the full path
729    $real_filename =~ s/^\\(.*)/$1/i;
730
731    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
732
733    if (-e $raw_filename) {
734
735#       if (defined $collect_dir) {
736#       my $collect_dir_re_safe = $collect_dir;
737#       $collect_dir_re_safe =~ s/\\/\\\\/g;
738#       $collect_dir_re_safe =~ s/\./\\./g;##
739
740#       $real_filename =~ s/^$collect_dir_re_safe//;
741#       }
742       
743        if (defined $reverse_lookups) {
744        $reverse_lookups->{$real_filename} = 1;
745        }
746###     push(@{$oid_files->{$field}},$full_file);
747        push(@{$oid_files->{$field}},$raw_filename);
748    }
749    else {
750        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
751    }
752    }
753}
754
755sub archiveinf_db
756{
757    my $self = shift (@_);
758    my ($doc_obj) = @_;
759
760    my $verbosity = $self->{'verbosity'};
761
762    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
763    if (defined $collect_dir) {
764    my $dirsep_regexp = &util::get_os_dirsep();
765
766    if ($collect_dir !~ /$dirsep_regexp$/) {
767        # ensure there is a slash at the end
768        $collect_dir .= &util::get_dirsep();
769    }
770    }
771
772    my $oid = $doc_obj->get_OID();
773    my $source_filename = $doc_obj->get_unmodified_source_filename();
774    my $working_info = $self->{'output_info'};
775    my $doc_info = $working_info->get_info($oid);
776
777    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
778    # doc_file is the path to the archive doc.xml. Make sure it has unix
779    # slashes, then if the collection is copied to linux, it can be built without reimport
780    $doc_file =~ s/\\/\//g;
781    my $oid_files = { 'doc-file' => $doc_file,
782              'index-status' => $index_status,
783              'src-file' => $source_filename,
784              'sort-meta' => $sortmeta,
785              'assoc-file' => [],
786              'meta-file'  => [] };
787   
788    my $reverse_lookups = { $source_filename => "1" };
789
790
791    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
792                     $collect_dir,$oid_files,$reverse_lookups);
793
794
795    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
796                     $collect_dir,$oid_files);
797
798    # Get the infodbtype value for this collection from the arcinfo object
799    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
800    my $output_dir = $self->{'output_dir'};
801
802    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
803    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
804
805    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
806
807    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
808    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
809    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
810    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
811
812    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
813    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
814    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
815
816    foreach my $rl (keys %$reverse_lookups) {
817    $working_info->add_reverseinfo($rl,$oid);
818    } 
819
820    # meta files not set in reverese entry, but need to set the metadata flag
821    if (defined $doc_obj->get_meta_files()) {
822    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
823        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
824        $working_info->set_meta_file_flag($full_file);
825    }
826    }
827}
828
829
830sub set_sortmeta {
831    my $self = shift (@_);
832    my ($sortmeta, $removeprefix, $removesuffix) = @_;
833   
834    $self->{'sortmeta'} = $sortmeta;
835    if (defined ($removeprefix) && $removeprefix ) {
836    $removeprefix =~ s/^\^//; # don't need a leading ^
837    $self->{'removeprefix'} = $removeprefix;
838    }
839    if (defined ($removesuffix) && $removesuffix) {
840    $removesuffix =~ s/\$$//; # don't need a trailing $
841    $self->{'removesuffix'} = $removesuffix;
842    }
843}
844
845sub open_xslt_pipe
846{
847    my $self = shift @_;
848    my ($output_file_name, $xslt_file)=@_;
849
850    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
851   
852    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
853
854    my $mapping_file_path = "";
855
856    if ($ENV{'GSDLOS'} eq "windows"){
857    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
858    # this file:/// bit didn't work for me on windows XP
859    #$xslt_file = "\"file:///".$xslt_file."\"";
860    #$mapping_file_path = "\"file:///";
861    }
862    else{
863    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
864    }
865
866
867    $java_class_path = "\"".$java_class_path."\"";
868
869    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
870
871    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
872    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
873    $cmd .= "-m $mapping_file_path";
874    }
875   
876    open(*XMLWRITER, $cmd)
877    or die "can't open pipe to xslt: $!";
878
879   
880    $self->{'xslt_writer'} = *XMLWRITER;
881
882    print XMLWRITER "<?DocStart?>\n";       
883    print XMLWRITER "$output_file_name\n";
884
885 
886  }
887 
888
889sub close_xslt_pipe
890{
891  my $self = shift @_;
892
893 
894  return unless defined $self->{'xslt_writer'} ;
895   
896  my $xsltwriter = $self->{'xslt_writer'};
897 
898  print $xsltwriter "<?DocEnd?>\n";
899  close($xsltwriter);
900
901  undef $self->{'xslt_writer'};
902
903}
904
905sub close_file_output
906{
907    my ($self) = @_;
908 
909    # make sure that the handle has been opened - it won't be if we failed
910    # to import any documents...
911    if (defined(fileno(GROUPPROCESS))) {
912    $self->output_xml_footer('GROUPPROCESS','Archive');   
913    close GROUPPROCESS;
914    }
915
916    my $OID = $self->{'gs_OID'};
917    my $short_doc_file = $self->{'short_doc_file'};
918   
919    if ($self->{'gzip'}) {
920    my $doc_file = $self->{'gs_filename'};
921    `gzip $doc_file`;
922    $doc_file .= ".gz";
923    $short_doc_file .= ".gz";
924    if (!-e $doc_file) {
925         my $outhandle = $self->{'output_handle'};
926        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
927        return 0;
928    }
929    }
930
931    # store reference in output_info
932    my $output_info = $self->{'output_info'};
933    return 0 if (!defined $output_info);
934    $output_info->add_info($OID, $short_doc_file, undef, undef);
935    return 1;
936}
937
938
939#the subclass should implement this method if is_group method could return 1.
940sub close_group_output{
941   my $self = shift (@_);       
942}
943
944sub is_group {
945    my $self = shift (@_);
946    return 0;       
947}
948
949my $dc_set = { Title => 1,       
950           Creator => 1,
951           Subject => 1,
952           Description => 1,
953           Publisher => 1,
954           Contributor => 1,
955           Date => 1,
956           Type => 1,
957           Format => 1,
958           Identifier => 1,
959           Source => 1,
960           Language => 1,
961           Relation => 1,
962           Coverage => 1,
963           Rights => 1};
964
965
966# returns an XML representation of the dublin core metadata
967# if dc meta is not found, try ex meta
968# This method is not used by the DSpacePlugout, which has its
969# own method to save its dc metadata
970sub get_dc_metadata {
971    my $self = shift(@_);
972    my ($doc_obj, $section, $version) = @_;
973   
974    # build up string of dublin core metadata
975    $section="" unless defined $section;
976   
977    my $section_ptr = $doc_obj->_lookup_section($section);
978    return "" unless defined $section_ptr;
979
980
981    my $explicit_dc = {};
982    my $explicit_ex_dc = {};
983    my $explicit_ex = {};
984
985    my $all_text="";
986   
987    # We want high quality dc metadata to go in first, so we store all the
988    # assigned dc.* values first. Then, for all those dc metadata names in
989    # the official dc set that are as yet unassigned, we look to see whether
990    # embedded ex.dc.* metadata has defined some values for them. If not,
991    # then for the same missing dc metadata names, we look in ex metadata.
992
993    foreach my $data (@{$section_ptr->{'metadata'}}){
994    my $escaped_value = &docprint::escape_text($data->[1]);
995    if ($data->[0]=~ m/^dc\./) {
996        $data->[0] =~ tr/[A-Z]/[a-z]/;
997
998        $data->[0] =~ m/^dc\.(.*)/;
999        my $dc_element =  $1;
1000
1001        if (!defined $explicit_dc->{$dc_element}) {
1002        $explicit_dc->{$dc_element} = [];
1003        }
1004        push(@{$explicit_dc->{$dc_element}},$escaped_value);
1005
1006        if (defined $version && ($version eq "oai_dc")) {
1007        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1008        }
1009        else {
1010        # qualifier???
1011        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1012        }
1013
1014    } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1015        $data->[0] =~ m/^ex\.dc\.(.*)/;
1016        my $ex_dc_element = $1;
1017        my $lc_ex_dc_element = lc($ex_dc_element);
1018
1019        # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1020        if (defined $dc_set->{$ex_dc_element}) {
1021        if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1022            $explicit_ex_dc->{$lc_ex_dc_element} = [];
1023        }
1024        push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1025        }
1026    }
1027    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1028        $data->[0] =~ m/^(ex\.)?(.*)/;
1029        my $ex_element = $2;
1030        my $lc_ex_element = lc($ex_element);
1031
1032        if (defined $dc_set->{$ex_element}) {
1033        if (!defined $explicit_ex->{$lc_ex_element}) {
1034            $explicit_ex->{$lc_ex_element} = [];
1035        }
1036        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1037        }
1038    }
1039    }
1040
1041    # go through dc_set and for any element *not* defined in explicit_dc
1042    # that does exist in explicit_ex, add it in as metadata
1043    foreach my $k ( keys %$dc_set ) {
1044    my $lc_k = lc($k);
1045
1046    if (!defined $explicit_dc->{$lc_k}) {
1047        # try to find if ex.dc.* defines this dc.* meta,
1048        # if not, then look for whether there's an ex.* equivalent
1049
1050        if (defined $explicit_ex_dc->{$lc_k}) {
1051        foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1052            my $dc_element    = $lc_k;
1053            my $escaped_value = $v;
1054           
1055            if (defined $version && ($version eq "oai_dc")) {
1056            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1057            }
1058            else {
1059            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1060            }           
1061        }
1062        } elsif (defined $explicit_ex->{$lc_k}) {
1063        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1064            my $dc_element    = $lc_k;
1065            my $escaped_value = $v;
1066
1067            if (defined $version && ($version eq "oai_dc")) {
1068            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1069            }
1070            else {
1071            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1072            }
1073        }
1074        }
1075    }
1076    }
1077
1078    if ($all_text eq "") {
1079    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1080    }   
1081    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1082
1083    return $all_text;
1084}
1085
1086# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1087# This method was apparently added by Jeffrey and committed by Shaoqun.
1088# But we don't know why it was added, so not using it anymore.
1089sub new_get_dc_metadata {
1090   
1091    my $self = shift(@_);
1092    my ($doc_obj, $section, $version) = @_;
1093
1094    # build up string of dublin core metadata
1095    $section="" unless defined $section;
1096   
1097    my $section_ptr=$doc_obj->_lookup_section($section);
1098    return "" unless defined $section_ptr;
1099
1100    my $all_text = "";
1101    foreach my $data (@{$section_ptr->{'metadata'}}){
1102    my $escaped_value = &docprint::escape_text($data->[1]);
1103    my $dc_element =  $data->[0];
1104   
1105    my @array = split('\.',$dc_element);
1106    my ($type,$name);
1107
1108    if(defined $array[1])
1109    {
1110        $type = $array[0];
1111        $name = $array[1];
1112    }
1113    else
1114    {
1115        $type = "ex";
1116        $name = $array[0];
1117    }
1118   
1119    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1120    }
1121    return $all_text;
1122}
1123
1124
11251;
Note: See TracBrowser for help on using the browser.