root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 24829

Revision 24829, 31.3 KB (checked in by ak19, 8 years ago)

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60     'deft' => "",
61    'hiddengli' => "no"},
62       { 'name' => "output_handle",
63    'desc' => "{BasPlugout.output_handle}",
64    'type' => "string",
65        'deft' =>  'STDERR',
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68       { 'name' => "verbosity",
69    'desc' => "{BasPlugout.verbosity}",
70    'type' => "int",
71        'deft' =>  "0",
72    'reqd' => "no", 
73        'hiddengli' => "no"},
74       { 'name' => "gzip_output",
75    'desc' => "{BasPlugout.gzip_output}",
76    'type' => "flag",
77    'reqd' => "no", 
78        'hiddengli' => "no"},
79       { 'name' => "debug",
80     'desc' => "{BasPlugout.debug}",
81     'type' => "flag",
82     'reqd' => "no",
83     'hiddengli' => "yes"}
84];
85
86my $options = { 'name'     => "BasePlugout",
87        'desc'     => "{BasPlugout.desc}",
88        'abstract' => "yes",
89        'inherits' => "no",
90        'args'     => $arguments};
91
92sub new
93{
94    my $class = shift (@_);
95
96    my ($plugoutlist,$args,$hashArgOptLists) = @_;
97    push(@$plugoutlist, $class);
98
99    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102    push(@{$hashArgOptLists->{"OptList"}},$options);
103
104    my $self = {};
105    $self->{'plugout_type'} = $class;
106    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107    $self->{"info_only"} = 0;
108
109    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110    # the args, just return the object. 
111    foreach my $strArg (@{$args})
112    {
113    if(defined $strArg && $strArg eq "-gsdlinfo")
114    {
115        $self->{"info_only"} = 1;
116        return bless $self, $class;
117    }
118    }
119   
120    delete $self->{"info_only"};
121   
122    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123    {
124    my $classTempClass = bless $self, $class;
125    print STDERR "<BadPlugout d=$plugout_name>\n";
126    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127    $classTempClass->print_txt_usage("");  # Use default resource bundle
128    die "\n";
129    }
130
131 
132    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
133    {
134    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135    if (!defined $full_file_path) {
136        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137        die "\n";
138    }
139    $self->{'xslt_file'} = $full_file_path;
140    }
141
142    $self->{'gs_count'} = 0;   
143
144    $self->{'keep_import_structure'} = 0;
145
146    return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152    my $self = shift(@_);
153    my $header = shift(@_);
154    my $high_level_information_only = shift(@_);
155   
156    # XML output is always in UTF-8
157    gsprintf::output_strings_in_UTF8;
158
159    if ($header) {
160    &PrintUsage::print_xml_header("plugout");
161    }
162    $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168    my $self = shift(@_);
169    my $high_level_information_only = shift(@_);
170
171    my $optionlistref = $self->{'option_list'};
172    my @optionlist = @$optionlistref;
173    my $plugoutoptions = shift(@$optionlistref);
174    return if (!defined($plugoutoptions));
175
176    gsprintf(STDERR, "<PlugoutInfo>\n");
177    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
178    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179    $desc =~ s/</&amp;lt;/g; # doubly escaped
180    $desc =~ s/>/&amp;gt;/g;
181    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
182    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184    unless (defined($high_level_information_only)) {
185    gsprintf(STDERR, "  <Arguments>\n");
186    if (defined($plugoutoptions->{'args'})) {
187        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188    }
189    gsprintf(STDERR, "  </Arguments>\n");
190
191    # Recurse up the plugout hierarchy
192    $self->print_xml();
193    }
194    gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200    my $self = shift(@_);
201
202    # Print the usage message for a plugout (recursively)
203    my $descoffset = $self->determine_description_offset(0);
204    $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209    my $self = shift(@_);
210    my $maxoffset = shift(@_);
211
212    my $optionlistref = $self->{'option_list'};
213    my @optionlist = @$optionlistref;
214    my $plugoutoptions = pop(@$optionlistref);
215    return $maxoffset if (!defined($plugoutoptions));
216
217    # Find the length of the longest option string of this download
218    my $plugoutargs = $plugoutoptions->{'args'};
219    if (defined($plugoutargs)) {
220    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221    if ($longest > $maxoffset) {
222        $maxoffset = $longest;
223    }
224    }
225
226    # Recurse up the download hierarchy
227    $maxoffset = $self->determine_description_offset($maxoffset);
228    $self->{'option_list'} = \@optionlist;
229    return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235    my $self = shift(@_);
236    my $descoffset = shift(@_);
237    my $isleafclass = shift(@_);
238
239    my $optionlistref = $self->{'option_list'};
240    my @optionlist = @$optionlistref;
241    my $plugoutoptions = shift(@$optionlistref);
242    return if (!defined($plugoutoptions));
243
244    my $plugoutname = $plugoutoptions->{'name'};
245    my $plugoutargs = $plugoutoptions->{'args'};
246    my $plugoutdesc = $plugoutoptions->{'desc'};
247
248    # Produce the usage information using the data structure above
249    if ($isleafclass) {
250    if (defined($plugoutdesc)) {
251        gsprintf(STDERR, "$plugoutdesc\n\n");
252    }
253    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254    }
255
256    # Display the download options, if there are some
257    if (defined($plugoutargs)) {
258    # Calculate the column offset of the option descriptions
259    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
260
261    if ($isleafclass) {
262        gsprintf(STDERR, " {common.specific_options}:\n");
263    }
264    else {
265        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266    }
267
268    # Display the download options
269    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270    }
271
272    # Recurse up the download hierarchy
273    $self->print_plugout_usage($descoffset, 0);
274    $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280      my ($strFunctionName,$strError) = @_;
281    {
282    print "Error occoured in BasePlugout.pm\n".
283        "In Function: ".$strFunctionName."\n".
284        "Error Message: ".$strError."\n";
285    exit(-1);
286    } 
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291    my $self = shift (@_);
292    my ($type, $metadata) = @_;
293
294    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295    $self->{'OIDtype'} = $type;
296    } else {
297    $self->{'OIDtype'} = "hash";
298    }
299    if ($type =~ /^assigned$/) {
300    if (defined $metadata) {
301        $self->{'OIDmetadata'} = $metadata;
302    } else {
303        $self->{'OIDmetadata'} = "dc.Identifier";
304    }
305    }
306}
307
308sub set_output_dir
309{
310    my $self = shift @_;
311    my ($output_dir) = @_;
312
313   $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318    my $self = shift @_;
319    my ($output_dir) = @_;
320
321    $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326    my $self = shift (@_);
327
328   return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333    my $self = shift (@_);
334
335    return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340    my $self = shift (@_);
341
342    return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348    my $self = shift (@_);
349
350    my ($output_file_name) = @_;
351     
352    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
353
354    return *OUTPUT;           
355}
356
357sub release_output_handler
358{
359    my $self = shift (@_);
360    my ($outhandler) = @_;
361
362    close($outhandler);
363
364}
365
366sub output_xml_header {
367    my $self = shift (@_);
368    my ($handle,$docroot,$nondoctype) = @_;
369   
370   
371    #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
372   
373    #For Dspace must be UTF in lower case
374    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
375   
376    if (!defined $nondoctype){
377    my $doctype = (defined $docroot) ? $docroot : "Section";
378
379    # Used to be '<!DOCTYPE Archive SYSTEM ...'
380   
381    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
382    }
383
384    print $handle "<$docroot>\n" if defined $docroot;
385}
386
387sub output_xml_footer {
388    my $self = shift (@_);
389    my ($handle,$docroot) = @_;
390    print $handle "</$docroot>\n" if defined $docroot;
391}
392
393
394sub output_general_xml_header
395{
396    my $self = shift (@_);
397    my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
398   
399    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
400   
401    if (defined $opt_dtd) {
402        my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
403    print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
404    }
405
406    if (defined $docroot) {
407        my $full_docroot = $docroot;
408        if (defined $opt_attributes) {
409          $full_docroot .= " $opt_attributes";
410        }
411
412        print $handle "<$full_docroot>\n"
413      }
414}
415
416sub output_general_xml_footer
417{
418    output_xml_footer(@_);
419}
420
421
422sub process {
423    my $self = shift (@_);
424    my ($doc_obj) = @_;
425   
426    # for OAI purposes
427    $doc_obj->set_lastmodified();
428    $doc_obj->set_oailastmodified();
429
430     if ($self->{'group_size'} > 1) {
431     $self->group_process ($doc_obj);
432    return;
433    }
434
435    my $OID = $doc_obj->get_OID();
436    $OID = "NULL" unless defined $OID;     
437
438    my $top_section = $doc_obj->get_top_section();
439
440    #get document's directory
441    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
442   
443    my $output_info = $self->{'output_info'};
444    return if (!defined $output_info);
445     
446    ##############################
447    # call subclass' saveas method
448    ##############################
449    $self->saveas($doc_obj,$doc_dir);
450    $self->archiveinf_db($doc_obj,$doc_dir);
451
452}
453
454sub store_output_info_reference {
455    my $self = shift (@_);
456    my ($doc_obj) = @_;
457
458    my $output_info = $self->{'output_info'};
459    my $metaname = $self->{'sortmeta'};
460    if (!defined $metaname || $metaname !~ /\S/) {
461    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
462    return;
463    }
464   
465    my $metadata = "";
466    my $top_section = $doc_obj->get_top_section();
467   
468    my @commameta_list = split(/,/, $metaname);
469    foreach my $cmn (@commameta_list) {
470    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
471    if ($meta) {
472        # do remove prefix/suffix - this will apply to all values
473        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
474        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
475        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
476        $metadata .= $meta if ($meta);
477    }
478    }
479
480    # store reference in the output_info     
481    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
482   
483}
484
485sub group_process {
486
487    my $self = shift (@_);
488    my ($doc_obj) = @_;
489   
490    my $OID = $doc_obj->get_OID();
491    $OID = "NULL" unless defined $OID;
492
493    my $groupsize = $self->{'group_size'};
494    my $gs_count = $self->{'gs_count'};
495    my $open_new_file = (($gs_count % $groupsize)==0);
496    my $outhandle = $self->{'output_handle'};
497
498    # opening a new file, or document has assoicated files => directory needed
499    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
500         
501        # The directory the archive file (doc.xml) and all associated files
502        # should end up in
503        my $doc_dir;
504        # If we've determined its time for a new file, open it now
505        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
506          {
507            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
508            # only if opening new file
509        my $output_dir = $self->get_output_dir();
510        &util::mk_all_dir ($output_dir) unless -e $output_dir;
511        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
512        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
513       
514        if ($gs_count>0)
515        {
516        return if (!$self->close_file_output());
517        }
518
519        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
520           
521
522        $self->{'gs_filename'} = $doc_file;
523        $self->{'short_doc_file'} = $short_doc_file;
524        $self->{'gs_OID'} = $OID;
525            $self->{'gs_doc_dir'} = $doc_dir;
526
527        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
528    }
529        # Otherwise load the same archive document directory used last time
530        else
531          {
532            $doc_dir = $self->{'gs_doc_dir'};
533          }
534
535    # copy all the associated files, add this information as metadata
536    # to the document
537        print $outhandle "Writing associated files to $doc_dir\n";
538    $self->process_assoc_files ($doc_obj, $doc_dir);
539
540    # look up 'gsdlmetafile' metadata and store that information
541    # explicitly in $doc_obj
542    $self->process_metafiles_metadata ($doc_obj);
543    }
544
545    # save this document
546    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
547    print GROUPPROCESS $section_text;
548
549    $self->{'gs_count'}++;
550}
551
552
553sub saveas {
554    my $self = shift (@_);
555   
556    die "Basplug::saveas function must be implemented in sub classes\n";
557}
558
559sub get_doc_dir {
560    my $self = shift (@_);
561    my ($OID, $source_filename) = @_;
562
563    my $working_dir  = $self->get_output_dir();
564    my $working_info = $self->{'output_info'};
565    return if (!defined $working_info);
566
567    my $doc_info = $working_info->get_info($OID);
568    my $doc_dir = '';
569
570    if (defined $doc_info && scalar(@$doc_info) >= 1)
571    {
572    # This OID already has an archives directory, so use it again
573    $doc_dir = $doc_info->[0];
574    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
575    }
576    elsif ($self->{'keep_import_structure'})
577    {
578    $source_filename = &File::Basename::dirname($source_filename);
579    $source_filename =~ s/[\\\/]+/\//g;
580    $source_filename =~ s/\/$//;
581
582        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
583    }
584
585    # We have to use a new archives directory for this document
586    if ($doc_dir eq "")
587    {
588    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
589    }
590
591    if (!defined $self->{'group'} || !$self->{'group'}){
592    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
593    }
594
595    return $doc_dir;
596}
597
598sub get_new_doc_dir{
599   my $self = shift (@_); 
600   my($working_info,$working_dir,$OID) = @_;     
601   
602   
603   my $doc_dir = "";
604   my $doc_dir_rest = $OID;
605
606   # remove any \ and / from the OID
607   $doc_dir_rest =~ s/[\\\/]//g;
608
609   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
610   $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
611
612   my $doc_dir_num = 0;
613
614   do {
615       $doc_dir .= "/" if $doc_dir_num > 0;
616       if ($doc_dir_rest =~ s/^(.{1,8})//) {
617       $doc_dir .= $1;
618       $doc_dir_num++;
619       }
620   } while ($doc_dir_rest ne "" &&
621        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
622         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
623   my $i = 1;
624   my $doc_dir_base = $doc_dir;
625   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
626       $doc_dir = "$doc_dir_base-$i";
627       $i++;
628   }
629         
630   return "$doc_dir.dir";
631}
632
633sub process_assoc_files {
634    my $self = shift (@_);
635    my ($doc_obj, $doc_dir, $handle) = @_;
636
637    my $outhandle = $self->{'output_handle'};
638   
639    my $output_dir = $self->get_output_dir();
640    return if (!defined $output_dir);
641
642    &util::mk_all_dir ($output_dir) unless -e $output_dir;
643     
644    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
645    &util::mk_all_dir ($working_dir) unless -e $working_dir;
646
647    my @assoc_files = ();
648    my $filename;;
649
650    my $source_filename = $doc_obj->get_source_filename();
651
652    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
653
654    if (defined $collect_dir) {
655    my $dirsep_regexp = &util::get_os_dirsep();
656
657    if ($collect_dir !~ /$dirsep_regexp$/) {
658        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
659    }
660
661    # This test is never going to fail on Windows -- is this a problem?
662     
663    if ($source_filename !~ /^$dirsep_regexp/) {
664        $source_filename = &util::filename_cat($collect_dir, $source_filename);
665    }
666    }
667
668
669    # set the assocfile path (even if we have no assoc files - need this for lucene)
670    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
671                     "assocfilepath",
672                     "$doc_dir");
673    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
674    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
675    $dir = "" unless defined $dir;
676       
677    my $utf8_real_filename = $assoc_file_rec->[0];
678
679    # for some reasons the image associate file has / before the full path
680    $utf8_real_filename =~ s/^\\(.*)/$1/i;
681
682##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
683    my $real_filename = $utf8_real_filename;
684    $real_filename = &util::downgrade_if_dos_filename($real_filename);
685
686    if (-e $real_filename) {
687
688        $filename = &util::filename_cat($working_dir, $afile);
689
690        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
691       
692        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
693                     "gsdlassocfile",
694                     "$afile:$assoc_file_rec->[2]:$dir");
695    } elsif ($self->{'verbosity'} > 1) {
696        print $outhandle "BasePlugout::process couldn't copy the associated file " .
697        "$real_filename to $afile\n";
698    }
699    }
700}
701
702
703sub process_metafiles_metadata
704{
705    my $self = shift (@_);
706    my ($doc_obj) = @_;
707
708    my $top_section = $doc_obj->get_top_section();
709    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
710
711    foreach my $metafile_pair (@$metafiles) {
712    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
713
714    $doc_obj->metadata_file($full_metafile,$metafile);
715    }
716
717    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
718}
719
720sub archiveinf_files_to_field
721{
722    my $self = shift(@_);
723    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
724
725    foreach my $file_rec (@$files) {
726    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
727    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
728    # for some reasons the image associate file has / before the full path
729    $real_filename =~ s/^\\(.*)/$1/i;
730
731    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
732
733    if (-e $raw_filename) {
734
735#       if (defined $collect_dir) {
736#       my $collect_dir_re_safe = $collect_dir;
737#       $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
738#       $collect_dir_re_safe =~ s/\./\\./g;##
739
740#       $real_filename =~ s/^$collect_dir_re_safe//;
741#       }
742       
743        if (defined $reverse_lookups) {
744        $reverse_lookups->{$real_filename} = 1;
745        }
746###     push(@{$oid_files->{$field}},$full_file);
747        push(@{$oid_files->{$field}},$raw_filename);
748    }
749    else {
750        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
751    }
752    }
753}
754
755sub archiveinf_db
756{
757    my $self = shift (@_);
758    my ($doc_obj) = @_;
759
760    my $verbosity = $self->{'verbosity'};
761
762    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
763    if (defined $collect_dir) {
764    my $dirsep_regexp = &util::get_os_dirsep();
765
766    if ($collect_dir !~ /$dirsep_regexp$/) {
767        # ensure there is a slash at the end
768        $collect_dir .= &util::get_dirsep();
769    }
770    }
771
772    my $oid = $doc_obj->get_OID();
773    my $source_filename = $doc_obj->get_unmodified_source_filename();
774    my $working_info = $self->{'output_info'};
775    my $doc_info = $working_info->get_info($oid);
776
777    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
778    # doc_file is the path to the archive doc.xml. Make sure it has unix
779    # slashes, then if the collection is copied to linux, it can be built without reimport
780    $doc_file =~ s/\\/\//g;
781    my $oid_files = { 'doc-file' => $doc_file,
782              'index-status' => $index_status,
783              'src-file' => $source_filename,
784              'sort-meta' => $sortmeta,
785              'assoc-file' => [],
786              'meta-file'  => [] };
787   
788    my $reverse_lookups = { $source_filename => "1" };
789
790
791    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
792                     $collect_dir,$oid_files,$reverse_lookups);
793
794
795    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
796                     $collect_dir,$oid_files);
797
798    # Get the infodbtype value for this collection from the arcinfo object
799    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
800    my $output_dir = $self->{'output_dir'};
801
802    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
803    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
804
805    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
806
807    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
808    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
809    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
810    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
811
812    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
813    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
814    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
815
816    foreach my $rl (keys %$reverse_lookups) {
817    $working_info->add_reverseinfo($rl,$oid);
818    } 
819
820    # meta files not set in reverese entry, but need to set the metadata flag
821    if (defined $doc_obj->get_meta_files()) {
822    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
823        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
824        $working_info->set_meta_file_flag($full_file);
825    }
826    }
827}
828
829
830sub set_sortmeta {
831    my $self = shift (@_);
832    my ($sortmeta, $removeprefix, $removesuffix) = @_;
833   
834    $self->{'sortmeta'} = $sortmeta;
835    if (defined ($removeprefix) && $removeprefix ) {
836    $removeprefix =~ s/^\^//; # don't need a leading ^
837    $self->{'removeprefix'} = $removeprefix;
838    }
839    if (defined ($removesuffix) && $removesuffix) {
840    $removesuffix =~ s/\$$//; # don't need a trailing $
841    $self->{'removesuffix'} = $removesuffix;
842    }
843}
844
845sub open_xslt_pipe
846{
847    my $self = shift @_;
848    my ($output_file_name, $xslt_file)=@_;
849
850    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
851   
852    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
853
854    my $mapping_file_path = "";
855
856    if ($ENV{'GSDLOS'} eq "windows"){
857    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
858    # this file:/// bit didn't work for me on windows XP
859    #$xslt_file = "\"file:///".$xslt_file."\"";
860    #$mapping_file_path = "\"file:///";
861    }
862    else{
863    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
864    }
865
866
867    $java_class_path = "\"".$java_class_path."\"";
868
869    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
870
871    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
872    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
873    $cmd .= "-m $mapping_file_path";
874    }
875   
876    open(*XMLWRITER, $cmd)
877    or die "can't open pipe to xslt: $!";
878
879   
880    $self->{'xslt_writer'} = *XMLWRITER;
881
882    print XMLWRITER "<?DocStart?>\n";       
883    print XMLWRITER "$output_file_name\n";
884
885 
886  }
887 
888
889sub close_xslt_pipe
890{
891  my $self = shift @_;
892
893 
894  return unless defined $self->{'xslt_writer'} ;
895   
896  my $xsltwriter = $self->{'xslt_writer'};
897 
898  print $xsltwriter "<?DocEnd?>\n";
899  close($xsltwriter);
900
901  undef $self->{'xslt_writer'};
902
903}
904
905sub close_file_output
906{
907    my ($self) = @_;
908 
909    # make sure that the handle has been opened - it won't be if we failed
910    # to import any documents...
911    if (defined(fileno(GROUPPROCESS))) {
912    $self->output_xml_footer('GROUPPROCESS','Archive');   
913    close GROUPPROCESS;
914    }
915
916    my $OID = $self->{'gs_OID'};
917    my $short_doc_file = $self->{'short_doc_file'};
918   
919    if ($self->{'gzip'}) {
920    my $doc_file = $self->{'gs_filename'};
921    `gzip $doc_file`;
922    $doc_file .= ".gz";
923    $short_doc_file .= ".gz";
924    if (!-e $doc_file) {
925         my $outhandle = $self->{'output_handle'};
926        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
927        return 0;
928    }
929    }
930
931    # store reference in output_info
932    my $output_info = $self->{'output_info'};
933    return 0 if (!defined $output_info);
934    $output_info->add_info($OID, $short_doc_file, undef, undef);
935    return 1;
936}
937
938
939#the subclass should implement this method if is_group method could return 1.
940sub close_group_output{
941   my $self = shift (@_);       
942}
943
944sub is_group {
945    my $self = shift (@_);
946    return 0;       
947}
948
949my $dc_set = { Title => 1,       
950           Creator => 1,
951           Subject => 1,
952           Description => 1,
953           Publisher => 1,
954           Contributor => 1,
955           Date => 1,
956           Type => 1,
957           Format => 1,
958           Identifier => 1,
959           Source => 1,
960           Language => 1,
961           Relation => 1,
962           Coverage => 1,
963           Rights => 1};
964
965
966# returns an XML representation of the dublin core metadata
967# if dc meta is not found, try ex meta
968# This method is not used by the DSpacePlugout, which has its
969# own method to save its dc metadata
970sub get_dc_metadata {
971    my $self = shift(@_);
972    my ($doc_obj, $section, $version) = @_;
973   
974    # build up string of dublin core metadata
975    $section="" unless defined $section;
976   
977    my $section_ptr = $doc_obj->_lookup_section($section);
978    return "" unless defined $section_ptr;
979
980
981    my $explicit_dc = {};
982    my $explicit_ex_dc = {};
983    my $explicit_ex = {};
984
985    my $all_text="";
986   
987    # We want high quality dc metadata to go in first, so we store all the
988    # assigned dc.* values first. Then, for all those dc metadata names in
989    # the official dc set that are as yet unassigned, we look to see whether
990    # embedded ex.dc.* metadata has defined some values for them. If not,
991    # then for the same missing dc metadata names, we look in ex metadata.
992
993    foreach my $data (@{$section_ptr->{'metadata'}}){
994    my $escaped_value = &docprint::escape_text($data->[1]);
995    if ($data->[0]=~ m/^dc\./) {
996        $data->[0] =~ tr/[A-Z]/[a-z]/;
997
998        $data->[0] =~ m/^dc\.(.*)/;
999        my $dc_element =  $1;
1000
1001        if (!defined $explicit_dc->{$dc_element}) {
1002        $explicit_dc->{$dc_element} = [];
1003        }
1004        push(@{$explicit_dc->{$dc_element}},$escaped_value);
1005
1006        if (defined $version && ($version eq "oai_dc")) {
1007        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1008        }
1009        else {
1010        # qualifier???
1011        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1012        }
1013
1014    } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1015        $data->[0] =~ m/^ex\.dc\.(.*)/;
1016        my $ex_dc_element = $1;
1017        my $lc_ex_dc_element = lc($ex_dc_element);
1018
1019        # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1020        if (defined $dc_set->{$ex_dc_element}) {
1021        if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1022            $explicit_ex_dc->{$lc_ex_dc_element} = [];
1023        }
1024        push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1025        }
1026    }
1027    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1028        $data->[0] =~ m/^(ex\.)?(.*)/;
1029        my $ex_element = $2;
1030        my $lc_ex_element = lc($ex_element);
1031
1032        if (defined $dc_set->{$ex_element}) {
1033        if (!defined $explicit_ex->{$lc_ex_element}) {
1034            $explicit_ex->{$lc_ex_element} = [];
1035        }
1036        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1037        }
1038    }
1039    }
1040
1041    # go through dc_set and for any element *not* defined in explicit_dc
1042    # that does exist in explicit_ex, add it in as metadata
1043    foreach my $k ( keys %$dc_set ) {
1044    my $lc_k = lc($k);
1045
1046    if (!defined $explicit_dc->{$lc_k}) {
1047        # try to find if ex.dc.* defines this dc.* meta,
1048        # if not, then look for whether there's an ex.* equivalent
1049
1050        if (defined $explicit_ex_dc->{$lc_k}) {
1051        foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1052            my $dc_element    = $lc_k;
1053            my $escaped_value = $v;
1054           
1055            if (defined $version && ($version eq "oai_dc")) {
1056            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1057            }
1058            else {
1059            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1060            }           
1061        }
1062        } elsif (defined $explicit_ex->{$lc_k}) {
1063        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1064            my $dc_element    = $lc_k;
1065            my $escaped_value = $v;
1066
1067            if (defined $version && ($version eq "oai_dc")) {
1068            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1069            }
1070            else {
1071            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1072            }
1073        }
1074        }
1075    }
1076    }
1077
1078    if ($all_text eq "") {
1079    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1080    }   
1081    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1082
1083    return $all_text;
1084}
1085
1086# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1087# This method was apparently added by Jeffrey and committed by Shaoqun.
1088# But we don't know why it was added, so not using it anymore.
1089sub new_get_dc_metadata {
1090   
1091    my $self = shift(@_);
1092    my ($doc_obj, $section, $version) = @_;
1093
1094    # build up string of dublin core metadata
1095    $section="" unless defined $section;
1096   
1097    my $section_ptr=$doc_obj->_lookup_section($section);
1098    return "" unless defined $section_ptr;
1099
1100    my $all_text = "";
1101    foreach my $data (@{$section_ptr->{'metadata'}}){
1102    my $escaped_value = &docprint::escape_text($data->[1]);
1103    my $dc_element =  $data->[0];
1104   
1105    my @array = split('\.',$dc_element);
1106    my ($type,$name);
1107
1108    if(defined $array[1])
1109    {
1110        $type = $array[0];
1111        $name = $array[1];
1112    }
1113    else
1114    {
1115        $type = "ex";
1116        $name = $array[0];
1117    }
1118   
1119    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1120    }
1121    return $all_text;
1122}
1123
1124
11251;
Note: See TracBrowser for help on using the browser.