root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 22232

Revision 22232, 28.2 KB (checked in by mdewsnip, 10 years ago)

New OAIMetadataXMLPlugin.pm for extracting information from OAI servers where metadata.xml files specify what to download from the OAI server. Done for Koha integration, but may be useful for other OAI servers. By Jeffrey Ke at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60     'deft' => "",
61    'hiddengli' => "no"},
62       { 'name' => "output_handle",
63    'desc' => "{BasPlugout.output_handle}",
64    'type' => "string",
65        'deft' =>  'STDERR',
66    'reqd' => "no",
67    'hiddengli' => "yes"},
68       { 'name' => "verbosity",
69    'desc' => "{BasPlugout.verbosity}",
70    'type' => "int",
71        'deft' =>  "0",
72    'reqd' => "no", 
73        'hiddengli' => "no"},
74       { 'name' => "gzip_output",
75    'desc' => "{BasPlugout.gzip_output}",
76    'type' => "flag",
77    'reqd' => "no", 
78        'hiddengli' => "no"},
79       { 'name' => "debug",
80     'desc' => "{BasPlugout.debug}",
81     'type' => "flag",
82     'reqd' => "no",
83     'hiddengli' => "yes"}
84];
85
86my $options = { 'name'     => "BasePlugout",
87        'desc'     => "{BasPlugout.desc}",
88        'abstract' => "yes",
89        'inherits' => "no",
90        'args'     => $arguments};
91
92sub new
93{
94    my $class = shift (@_);
95
96    my ($plugoutlist,$args,$hashArgOptLists) = @_;
97    push(@$plugoutlist, $class);
98
99    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102    push(@{$hashArgOptLists->{"OptList"}},$options);
103
104    my $self = {};
105    $self->{'plugout_type'} = $class;
106    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107    $self->{"info_only"} = 0;
108
109    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110    # the args, just return the object. 
111    foreach my $strArg (@{$args})
112    {
113    if(defined $strArg && $strArg eq "-gsdlinfo")
114    {
115        $self->{"info_only"} = 1;
116        return bless $self, $class;
117    }
118    }
119   
120    delete $self->{"info_only"};
121   
122    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123    {
124    my $classTempClass = bless $self, $class;
125    print STDERR "<BadPlugout d=$plugout_name>\n";
126    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127    $classTempClass->print_txt_usage("");  # Use default resource bundle
128    die "\n";
129    }
130
131 
132    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
133    {
134    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135    if (!defined $full_file_path) {
136        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137        die "\n";
138    }
139    $self->{'xslt_file'} = $full_file_path;
140    }
141
142    $self->{'gs_count'} = 0;   
143
144    $self->{'keep_import_structure'} = 0;
145
146    return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152    my $self = shift(@_);
153    my $header = shift(@_);
154    my $high_level_information_only = shift(@_);
155   
156    # XML output is always in UTF-8
157    gsprintf::output_strings_in_UTF8;
158
159    if ($header) {
160    &PrintUsage::print_xml_header("plugout");
161    }
162    $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168    my $self = shift(@_);
169    my $high_level_information_only = shift(@_);
170
171    my $optionlistref = $self->{'option_list'};
172    my @optionlist = @$optionlistref;
173    my $plugoutoptions = shift(@$optionlistref);
174    return if (!defined($plugoutoptions));
175
176    gsprintf(STDERR, "<PlugoutInfo>\n");
177    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
178    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179    $desc =~ s/</&amp;lt;/g; # doubly escaped
180    $desc =~ s/>/&amp;gt;/g;
181    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
182    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184    unless (defined($high_level_information_only)) {
185    gsprintf(STDERR, "  <Arguments>\n");
186    if (defined($plugoutoptions->{'args'})) {
187        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188    }
189    gsprintf(STDERR, "  </Arguments>\n");
190
191    # Recurse up the plugout hierarchy
192    $self->print_xml();
193    }
194    gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200    my $self = shift(@_);
201
202    # Print the usage message for a plugout (recursively)
203    my $descoffset = $self->determine_description_offset(0);
204    $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209    my $self = shift(@_);
210    my $maxoffset = shift(@_);
211
212    my $optionlistref = $self->{'option_list'};
213    my @optionlist = @$optionlistref;
214    my $plugoutoptions = pop(@$optionlistref);
215    return $maxoffset if (!defined($plugoutoptions));
216
217    # Find the length of the longest option string of this download
218    my $plugoutargs = $plugoutoptions->{'args'};
219    if (defined($plugoutargs)) {
220    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221    if ($longest > $maxoffset) {
222        $maxoffset = $longest;
223    }
224    }
225
226    # Recurse up the download hierarchy
227    $maxoffset = $self->determine_description_offset($maxoffset);
228    $self->{'option_list'} = \@optionlist;
229    return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235    my $self = shift(@_);
236    my $descoffset = shift(@_);
237    my $isleafclass = shift(@_);
238
239    my $optionlistref = $self->{'option_list'};
240    my @optionlist = @$optionlistref;
241    my $plugoutoptions = shift(@$optionlistref);
242    return if (!defined($plugoutoptions));
243
244    my $plugoutname = $plugoutoptions->{'name'};
245    my $plugoutargs = $plugoutoptions->{'args'};
246    my $plugoutdesc = $plugoutoptions->{'desc'};
247
248    # Produce the usage information using the data structure above
249    if ($isleafclass) {
250    if (defined($plugoutdesc)) {
251        gsprintf(STDERR, "$plugoutdesc\n\n");
252    }
253    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254    }
255
256    # Display the download options, if there are some
257    if (defined($plugoutargs)) {
258    # Calculate the column offset of the option descriptions
259    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
260
261    if ($isleafclass) {
262        gsprintf(STDERR, " {common.specific_options}:\n");
263    }
264    else {
265        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266    }
267
268    # Display the download options
269    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270    }
271
272    # Recurse up the download hierarchy
273    $self->print_plugout_usage($descoffset, 0);
274    $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280      my ($strFunctionName,$strError) = @_;
281    {
282    print "Error occoured in BasePlugout.pm\n".
283        "In Function: ".$strFunctionName."\n".
284        "Error Message: ".$strError."\n";
285    exit(-1);
286    } 
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291    my $self = shift (@_);
292    my ($type, $metadata) = @_;
293
294    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295    $self->{'OIDtype'} = $type;
296    } else {
297    $self->{'OIDtype'} = "hash";
298    }
299    if ($type =~ /^assigned$/) {
300    if (defined $metadata) {
301        $self->{'OIDmetadata'} = $metadata;
302    } else {
303        $self->{'OIDmetadata'} = "dc.Identifier";
304    }
305    }
306}
307
308sub set_output_dir
309{
310    my $self = shift @_;
311    my ($output_dir) = @_;
312
313   $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318    my $self = shift @_;
319    my ($output_dir) = @_;
320
321    $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326    my $self = shift (@_);
327
328   return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333    my $self = shift (@_);
334
335    return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340    my $self = shift (@_);
341
342    return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348    my $self = shift (@_);
349
350    my ($output_file_name) = @_;
351     
352    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
353
354    return *OUTPUT;           
355}
356
357sub release_output_handler
358{
359    my $self = shift (@_);
360    my ($outhandler) = @_;
361
362    close($outhandler);
363
364}
365
366sub output_xml_header {
367    my $self = shift (@_);
368    my ($handle,$docroot,$nondoctype) = @_;
369   
370    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
371   
372    if (!defined $nondoctype){
373    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
374    }
375
376    print $handle "<$docroot>\n" if defined $docroot;
377}
378
379sub output_xml_footer {
380    my $self = shift (@_);
381    my ($handle,$docroot) = @_;
382    print $handle "</$docroot>\n" if defined $docroot;
383}
384
385sub process {
386    my $self = shift (@_);
387    my ($doc_obj) = @_;
388   
389    $doc_obj->set_lastmodified();
390
391     if ($self->{'group_size'} > 1) {
392     $self->group_process ($doc_obj);
393    return;
394    }
395
396    my $OID = $doc_obj->get_OID();
397    $OID = "NULL" unless defined $OID;     
398
399    my $top_section = $doc_obj->get_top_section();
400
401    #get document's directory
402    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
403   
404    my $output_info = $self->{'output_info'};
405    return if (!defined $output_info);
406     
407    ##############################
408    # call subclass' saveas method
409    ##############################
410    $self->saveas($doc_obj,$doc_dir);
411    $self->archiveinf_db($doc_obj,$doc_dir);
412
413}
414
415sub store_output_info_reference {
416    my $self = shift (@_);
417    my ($doc_obj) = @_;
418
419    my $output_info = $self->{'output_info'};
420    my $metaname = $self->{'sortmeta'};
421    if (!defined $metaname || $metaname !~ /\S/) {
422    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
423    return;
424    }
425   
426    my $metadata = "";
427    my $top_section = $doc_obj->get_top_section();
428   
429    my @commameta_list = split(/,/, $metaname);
430    foreach my $cmn (@commameta_list) {
431    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
432    if ($meta) {
433        # do remove prefix/suffix - this will apply to all values
434        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
435        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
436        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
437        $metadata .= $meta if ($meta);
438    }
439    }
440
441    # store reference in the output_info     
442    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
443   
444}
445
446sub group_process {
447
448    my $self = shift (@_);
449    my ($doc_obj) = @_;
450   
451    my $OID = $doc_obj->get_OID();
452    $OID = "NULL" unless defined $OID;
453
454    my $groupsize = $self->{'group_size'};
455    my $gs_count = $self->{'gs_count'};
456    my $open_new_file = (($gs_count % $groupsize)==0);
457    my $outhandle = $self->{'output_handle'};
458
459    # opening a new file, or document has assoicated files => directory needed
460    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
461         
462        # The directory the archive file (doc.xml) and all associated files
463        # should end up in
464        my $doc_dir;
465        # If we've determined its time for a new file, open it now
466        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
467          {
468            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
469            # only if opening new file
470        my $output_dir = $self->get_output_dir();
471        &util::mk_all_dir ($output_dir) unless -e $output_dir;
472        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
473        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
474       
475        if ($gs_count>0)
476        {
477        return if (!$self->close_file_output());
478        }
479
480        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
481           
482
483        $self->{'gs_filename'} = $doc_file;
484        $self->{'short_doc_file'} = $short_doc_file;
485        $self->{'gs_OID'} = $OID;
486            $self->{'gs_doc_dir'} = $doc_dir;
487
488        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
489    }
490        # Otherwise load the same archive document directory used last time
491        else
492          {
493            $doc_dir = $self->{'gs_doc_dir'};
494          }
495
496    # copy all the associated files, add this information as metadata
497    # to the document
498        print $outhandle "Writing associated files to $doc_dir\n";
499    $self->process_assoc_files ($doc_obj, $doc_dir);
500
501    # look up 'gsdlmetafile' metadata and store that information
502    # explicitly in $doc_obj
503    $self->process_metafiles_metadata ($doc_obj);
504    }
505
506    # save this document
507    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
508    print GROUPPROCESS $section_text;
509
510    $self->{'gs_count'}++;
511}
512
513
514sub saveas {
515    my $self = shift (@_);
516   
517    die "Basplug::saveas function must be implemented in sub classes\n";
518}
519
520sub get_doc_dir {
521    my $self = shift (@_);
522    my ($OID, $source_filename) = @_;
523
524    my $working_dir  = $self->get_output_dir();
525    my $working_info = $self->{'output_info'};
526    return if (!defined $working_info);
527
528    my $doc_info = $working_info->get_info($OID);
529    my $doc_dir = '';
530
531    if (defined $doc_info && scalar(@$doc_info) >= 1)
532    {
533    # This OID already has an archives directory, so use it again
534    $doc_dir = $doc_info->[0];
535    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
536    }
537    elsif ($self->{'keep_import_structure'})
538    {
539    $source_filename = &File::Basename::dirname($source_filename);
540    $source_filename =~ s/[\\\/]+/\//g;
541    $source_filename =~ s/\/$//;
542
543        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
544    }
545
546    # We have to use a new archives directory for this document
547    if ($doc_dir eq "")
548    {
549    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
550    }
551
552    if (!defined $self->{'group'} || !$self->{'group'}){
553    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
554    }
555
556    return $doc_dir;
557}
558
559sub get_new_doc_dir{
560   my $self = shift (@_); 
561   my($working_info,$working_dir,$OID) = @_;     
562   
563   
564   my $doc_dir = "";
565   my $doc_dir_rest = $OID;
566
567   # remove any \ and / from the OID
568   $doc_dir_rest =~ s/[\\\/]//g;
569
570   # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
571   $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
572
573   my $doc_dir_num = 0;
574
575   do {
576       $doc_dir .= "/" if $doc_dir_num > 0;
577       if ($doc_dir_rest =~ s/^(.{1,8})//) {
578       $doc_dir .= $1;
579       $doc_dir_num++;
580       }
581   } while ($doc_dir_rest ne "" &&
582        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
583         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
584   my $i = 1;
585   my $doc_dir_base = $doc_dir;
586   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
587       $doc_dir = "$doc_dir_base-$i";
588       $i++;
589   }
590         
591   return "$doc_dir.dir";
592}
593
594sub process_assoc_files {
595    my $self = shift (@_);
596    my ($doc_obj, $doc_dir, $handle) = @_;
597
598    my $outhandle = $self->{'output_handle'};
599   
600    my $output_dir = $self->get_output_dir();
601    return if (!defined $output_dir);
602
603    &util::mk_all_dir ($output_dir) unless -e $output_dir;
604     
605    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
606    &util::mk_all_dir ($working_dir) unless -e $working_dir;
607
608    my @assoc_files = ();
609    my $filename;;
610
611    my $source_filename = $doc_obj->get_source_filename();
612
613    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
614
615    if (defined $collect_dir) {
616    my $dirsep_regexp = &util::get_os_dirsep();
617
618    if ($collect_dir !~ /$dirsep_regexp$/) {
619        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
620    }
621
622    # This test is never going to fail on Windows -- is this a problem?
623     
624    if ($source_filename !~ /^$dirsep_regexp/) {
625        $source_filename = &util::filename_cat($collect_dir, $source_filename);
626    }
627    }
628
629
630    # set the assocfile path (even if we have no assoc files - need this for lucene)
631    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
632                     "assocfilepath",
633                     "$doc_dir");
634    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
635    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
636    $dir = "" unless defined $dir;
637       
638   
639    my $real_filename = $assoc_file_rec->[0];
640    # for some reasons the image associate file has / before the full path
641    $real_filename =~ s/^\\(.*)/$1/i;
642    if (-e $real_filename) {
643
644        $filename = &util::filename_cat($working_dir, $afile);
645
646        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
647       
648        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
649                     "gsdlassocfile",
650                     "$afile:$assoc_file_rec->[2]:$dir");
651    } elsif ($self->{'verbosity'} > 2) {
652        print $outhandle "BasePlugout::process couldn't copy the associated file " .
653        "$real_filename to $afile\n";
654    }
655    }
656}
657
658
659sub process_metafiles_metadata
660{
661    my $self = shift (@_);
662    my ($doc_obj) = @_;
663
664    my $top_section = $doc_obj->get_top_section();
665    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
666
667    foreach my $metafile_pair (@$metafiles) {
668    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
669
670    $doc_obj->metadata_file($full_metafile,$metafile);
671    }
672
673    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
674}
675
676sub archiveinf_files_to_field
677{
678    my $self = shift(@_);
679    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
680
681    foreach my $file_rec (@$files) {
682    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
683    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
684    # for some reasons the image associate file has / before the full path
685    $real_filename =~ s/^\\(.*)/$1/i;
686    if (-e $real_filename) {
687
688#       if (defined $collect_dir) {
689#       my $collect_dir_re_safe = $collect_dir;
690#       $collect_dir_re_safe =~ s/\\/\\\\/g;
691#       $collect_dir_re_safe =~ s/\./\\./g;##
692
693#       $real_filename =~ s/^$collect_dir_re_safe//;
694#       }
695        if (defined $reverse_lookups) {
696        $reverse_lookups->{$real_filename} = 1;
697        }
698        push(@{$oid_files->{$field}},$full_file);
699    }
700    else {
701        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
702    }
703    }
704}
705
706sub archiveinf_db
707{
708    my $self = shift (@_);
709    my ($doc_obj) = @_;
710
711    my $verbosity = $self->{'verbosity'};
712
713    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
714    if (defined $collect_dir) {
715    my $dirsep_regexp = &util::get_os_dirsep();
716
717    if ($collect_dir !~ /$dirsep_regexp$/) {
718        # ensure there is a slash at the end
719        $collect_dir .= &util::get_dirsep();
720    }
721    }
722
723    my $oid = $doc_obj->get_OID();
724    my $source_filename = $doc_obj->get_unmodified_source_filename();
725
726    my $working_info = $self->{'output_info'};
727    my $doc_info = $working_info->get_info($oid);
728
729    my ($doc_file,$index_status,$sortmeta) = @$doc_info;
730
731    my $oid_files = { 'doc-file' => $doc_file,
732              'index-status' => $index_status,
733              'src-file' => $source_filename,
734              'sort-meta' => $sortmeta,
735              'assoc-file' => [],
736              'meta-file'  => [] };
737   
738    my $reverse_lookups = { $source_filename => "1" };
739
740
741    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
742                     $collect_dir,$oid_files,$reverse_lookups);
743
744
745    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
746                     $collect_dir,$oid_files);
747
748    # Get the infodbtype value for this collection from the arcinfo object
749    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
750    my $output_dir = $self->{'output_dir'};
751
752    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
753    my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
754
755    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
756
757    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
758    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
759    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
760    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
761
762    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
763    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
764    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
765
766    foreach my $rl (keys %$reverse_lookups) {
767    $working_info->add_reverseinfo($rl,$oid);
768    } 
769
770    # meta files not set in reverese entry, but need to set the metadata flag
771    if (defined $doc_obj->get_meta_files()) {
772    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
773        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
774        $working_info->set_meta_file_flag($full_file);
775    }
776    }
777}
778
779
780sub set_sortmeta {
781    my $self = shift (@_);
782    my ($sortmeta, $removeprefix, $removesuffix) = @_;
783   
784    $self->{'sortmeta'} = $sortmeta;
785    if (defined ($removeprefix) && $removeprefix ) {
786    $removeprefix =~ s/^\^//; # don't need a leading ^
787    $self->{'removeprefix'} = $removeprefix;
788    }
789    if (defined ($removesuffix) && $removesuffix) {
790    $removesuffix =~ s/\$$//; # don't need a trailing $
791    $self->{'removesuffix'} = $removesuffix;
792    }
793}
794
795sub open_xslt_pipe
796{
797    my $self = shift @_;
798    my ($output_file_name, $xslt_file)=@_;
799
800    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
801   
802    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
803
804    my $mapping_file_path = "";
805
806    if ($ENV{'GSDLOS'} eq "windows"){
807    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
808    # this file:/// bit didn't work for me on windows XP
809    #$xslt_file = "\"file:///".$xslt_file."\"";
810    #$mapping_file_path = "\"file:///";
811    }
812    else{
813    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
814    }
815
816
817    $java_class_path = "\"".$java_class_path."\"";
818
819    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
820
821    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
822    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
823    $cmd .= "-m $mapping_file_path";
824    }
825   
826    open(*XMLWRITER, $cmd)
827    or die "can't open pipe to xslt: $!";
828
829   
830    $self->{'xslt_writer'} = *XMLWRITER;
831
832    print XMLWRITER "<?DocStart?>\n";       
833    print XMLWRITER "$output_file_name\n";
834
835 
836  }
837 
838
839sub close_xslt_pipe
840{
841  my $self = shift @_;
842
843 
844  return unless defined $self->{'xslt_writer'} ;
845   
846  my $xsltwriter = $self->{'xslt_writer'};
847 
848  print $xsltwriter "<?DocEnd?>\n";
849  close($xsltwriter);
850
851  undef $self->{'xslt_writer'};
852
853}
854
855sub close_file_output
856{
857    my ($self) = @_;
858 
859    # make sure that the handle has been opened - it won't be if we failed
860    # to import any documents...
861    if (defined(fileno(GROUPPROCESS))) {
862    $self->output_xml_footer('GROUPPROCESS','Archive');   
863    close GROUPPROCESS;
864    }
865
866    my $OID = $self->{'gs_OID'};
867    my $short_doc_file = $self->{'short_doc_file'};
868   
869    if ($self->{'gzip'}) {
870    my $doc_file = $self->{'gs_filename'};
871    `gzip $doc_file`;
872    $doc_file .= ".gz";
873    $short_doc_file .= ".gz";
874    if (!-e $doc_file) {
875         my $outhandle = $self->{'output_handle'};
876        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
877        return 0;
878    }
879    }
880
881    # store reference in output_info
882    my $output_info = $self->{'output_info'};
883    return 0 if (!defined $output_info);
884    $output_info->add_info($OID, $short_doc_file, undef, undef);
885    return 1;
886}
887
888
889#the subclass should implement this method if is_group method could return 1.
890sub close_group_output{
891   my $self = shift (@_);       
892}
893
894sub is_group {
895    my $self = shift (@_);
896    return 0;       
897}
898
899my $dc_set = { Title => 1,       
900           Creator => 1,
901           Subject => 1,
902           Description => 1,
903           Publisher => 1,
904           Contributor => 1,
905           Date => 1,
906           Type => 1,
907           Format => 1,
908           Identifier => 1,
909           Source => 1,
910           Language => 1,
911           Relation => 1,
912           Coverage => 1,
913           Rights => 1};
914
915
916# returns an XML representation of the dublin core metadata
917# if dc meta is not found, try ex mete
918sub get_dc_metadata {
919    my $self = shift(@_);
920    my ($doc_obj, $section, $version) = @_;
921   
922    # build up string of dublin core metadata
923    $section="" unless defined $section;
924   
925    my $section_ptr = $doc_obj->_lookup_section($section);
926    return "" unless defined $section_ptr;
927
928
929    my $explicit_dc = {};
930    my $explicit_ex = {};
931
932    my $all_text="";
933    foreach my $data (@{$section_ptr->{'metadata'}}){
934    my $escaped_value = &docprint::escape_text($data->[1]);
935    if ($data->[0]=~ m/^dc\./) {
936        $data->[0] =~ tr/[A-Z]/[a-z]/;
937
938        $data->[0] =~ m/^dc\.(.*)/;
939        my $dc_element =  $1;
940
941        if (!defined $explicit_dc->{$dc_element}) {
942        $explicit_dc->{$dc_element} = [];
943        }
944        push(@{$explicit_dc->{$dc_element}},$escaped_value);
945
946        if (defined $version && ($version eq "oai_dc")) {
947        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
948        }
949        else {
950        # qualifier???
951        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
952        }
953
954    }
955    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
956        $data->[0] =~ m/^(ex\.)?(.*)/;
957        my $ex_element =  $2;
958        my $lc_ex_element = lc($ex_element);
959
960        if (defined $dc_set->{$ex_element}) {
961        if (!defined $explicit_ex->{$lc_ex_element}) {
962            $explicit_ex->{$lc_ex_element} = [];
963        }
964        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
965        }
966    }
967    }
968
969    # go through dc_set and for any element *not* defined in explicit_dc
970    # that does exist in explicit_ex, add it in as metadata
971    foreach my $k ( keys %$dc_set ) {
972    my $lc_k = lc($k);
973
974    if (!defined $explicit_dc->{$lc_k}) {
975        if (defined $explicit_ex->{$lc_k}) {
976
977        foreach my $v (@{$explicit_ex->{$lc_k}}) {
978            my $dc_element    = $lc_k;
979            my $escaped_value = $v;
980
981            if (defined $version && ($version eq "oai_dc")) {
982            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
983            }
984            else {
985            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
986            }
987           
988        }
989        }
990    }
991    }
992
993    if ($all_text eq "") {
994    $all_text .= "   There is no Dublin Core metatdata in this document\n";
995    }   
996    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
997
998    return $all_text;
999}
1000
1001# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1002# This method was apparently added by Jeffrey and committed by Shaoqun.
1003# But we don't know why it was added, so not using it anymore.
1004sub new_get_dc_metadata {
1005   
1006    my $self = shift(@_);
1007    my ($doc_obj, $section, $version) = @_;
1008
1009    # build up string of dublin core metadata
1010    $section="" unless defined $section;
1011   
1012    my $section_ptr=$doc_obj->_lookup_section($section);
1013    return "" unless defined $section_ptr;
1014
1015    my $all_text = "";
1016    foreach my $data (@{$section_ptr->{'metadata'}}){
1017    my $escaped_value = &docprint::escape_text($data->[1]);
1018    my $dc_element =  $data->[0];
1019   
1020    my @array = split('\.',$dc_element);
1021    my ($type,$name);
1022
1023    if(defined $array[1])
1024    {
1025        $type = $array[0];
1026        $name = $array[1];
1027    }
1028    else
1029    {
1030        $type = "ex";
1031        $name = $array[0];
1032    }
1033   
1034    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1035    }
1036    return $all_text;
1037}
1038
1039
10401;
Note: See TracBrowser for help on using the browser.