root/gsdl/trunk/perllib/plugouts/BasePlugout.pm @ 20320

Revision 20320, 28.8 KB (checked in by kjdon, 11 years ago)

some changes to get MARCXML exporting working on windows

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use gsprintf 'gsprintf';
35use printusage;
36use parse2;
37use GDBMUtils;
38
39
# Swallow the noisy "Subroutine ... redefined" warnings (seen under
# perl 5.6); every other warning is re-issued unchanged.
$SIG{__WARN__} = sub {
    my $message = $_[0];
    if ($message =~ /Subroutine\s+\S+\sredefined/) {
        return;
    }
    warn($message);
};
43
# Argument descriptions shared by every plugout.  Each entry is a hash
# that new() hands to parse2::parse() and that the usage printers pass
# to PrintUsage.  The 'desc' values are lookup keys, not display text.
my $arguments = [
    {
        'name'      => "group_size",
        'desc'      => "{BasPlugout.group_size}",
        'type'      => "int",
        'deft'      => "1",
        'reqd'      => "no",
        'hiddengli' => "no",
    },
    {
        'name'      => "output_info",
        'desc'      => "{BasPlugout.output_info}",
        'type'      => "string",
        'reqd'      => "yes",
        'hiddengli' => "yes",
    },
    {
        'name'      => "xslt_file",
        'desc'      => "{BasPlugout.xslt_file}",
        'type'      => "string",
        'reqd'      => "no",
        'deft'      => "",
        'hiddengli' => "no",
    },
    {
        'name'      => "output_handle",
        'desc'      => "{BasPlugout.output_handle}",
        'type'      => "string",
        'deft'      => 'STDERR',
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "verbosity",
        'desc'      => "{BasPlugout.verbosity}",
        'type'      => "int",
        'deft'      => "0",
        'reqd'      => "no",
        'hiddengli' => "no",
    },
    {
        'name'      => "gzip_output",
        'desc'      => "{BasPlugout.gzip_output}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "no",
    },
    {
        'name'      => "debug",
        'desc'      => "{BasPlugout.debug}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
];

# Description of this class itself; new() pushes it onto the caller's
# "OptList" so the usage printers can walk the plugout hierarchy.
my $options = {
    'name'     => "BasePlugout",
    'desc'     => "{BasPlugout.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
91
# Constructor shared by all plugouts.
#
#   $plugoutlist     - array ref of class names; this class is appended
#   $args            - array ref of command-line style arguments
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays collect
#                      the argument and option tables of the hierarchy
#
# Returns the blessed object.  When "-gsdlinfo" is among the arguments
# no parsing is done and the object only carries 'info_only' => 1.
# Dies (after printing usage) when the arguments cannot be parsed or the
# requested xslt_file cannot be located.
sub new
{
    my ($class, $plugoutlist, $args, $hashArgOptLists) = @_;

    push(@$plugoutlist, $class);

    my $plugout_name = $class;
    if (defined $plugoutlist->[0]) {
        $plugout_name = $plugoutlist->[0];
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = {
        'plugout_type' => $class,
        'option_list'  => $hashArgOptLists->{"OptList"},
    };

    # An information-only request short-circuits argument parsing.
    foreach my $candidate (@{$args}) {
        next unless (defined $candidate && $candidate eq "-gsdlinfo");
        $self->{"info_only"} = 1;
        return bless $self, $class;
    }

    if (parse2::parse($args, $hashArgOptLists->{"ArgList"}, $self) == -1) {
        # Bless early so the usage printer can be called as a method.
        my $usage_printer = bless $self, $class;
        print STDERR "<BadPlugout d=$plugout_name>\n";
        &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
        $usage_printer->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    my $requested_xslt = $self->{'xslt_file'};
    if (defined $requested_xslt && $requested_xslt ne "") {
        my $full_file_path = &util::locate_config_file($requested_xslt);
        unless (defined $full_file_path) {
            print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
            die "\n";
        }
        $self->{'xslt_file'} = $full_file_path;
    }

    $self->{'gs_count'} = 0;
    $self->{'keep_import_structure'} = 0;

    return bless $self, $class;
}
149
# Print the XML description of this plugout.
#
#   $header                      - when true, the XML header for a
#                                  "plugout" document is printed first
#   $high_level_information_only - passed straight through to print_xml
sub print_xml_usage
{
    my ($self, $header, $high_level_information_only) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8;

    &PrintUsage::print_xml_header("plugout") if ($header);

    $self->print_xml($high_level_information_only);
}
164
165
# Print a <PlugoutInfo> element for the first entry of the option list
# and, unless only high level information was requested, recurse so
# that the remaining entries (the rest of the plugout hierarchy) are
# nested inside it.
#
#   $high_level_information_only - when defined, the <Arguments>
#                                  section and the recursion are skipped
#
# The option list is consumed while printing; it is put back before
# returning (as the text usage printers do) so the object can be asked
# for its usage again.
sub print_xml
{
    my $self = shift(@_);
    my $high_level_information_only = shift(@_);

    my $optionlistref = $self->{'option_list'};
    # Snapshot taken before shifting, used to restore the list below.
    my @optionlist = @$optionlistref;
    my $plugoutoptions = shift(@$optionlistref);
    return if (!defined($plugoutoptions));

    gsprintf(STDERR, "<PlugoutInfo>\n");
    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
    $desc =~ s/</&amp;lt;/g; # doubly escaped
    $desc =~ s/>/&amp;gt;/g;
    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
    unless (defined($high_level_information_only)) {
        gsprintf(STDERR, "  <Arguments>\n");
        if (defined($plugoutoptions->{'args'})) {
            &PrintUsage::print_options_xml($plugoutoptions->{'args'});
        }
        gsprintf(STDERR, "  </Arguments>\n");

        # Recurse up the plugout hierarchy
        $self->print_xml();
    }
    gsprintf(STDERR, "</PlugoutInfo>\n");

    # Undo the destructive shift so later usage requests still work.
    $self->{'option_list'} = \@optionlist;
}
196
197
# Print the plain-text usage message for this plugout and, recursively,
# for the classes it inherits from.
sub print_txt_usage
{
    my ($self) = @_;

    # Column at which the option descriptions start, taken over the
    # whole hierarchy so that every section lines up.
    my $description_column = $self->determine_description_offset(0);

    $self->print_plugout_usage($description_column, 1);
}
206
# Walk the option list and return the length of the longest option
# string found in any entry, or $maxoffset if that is larger.
#
# Each level pops one entry off the shared list and recurses; on the
# way back the list is replaced by the snapshot taken before popping,
# so the object ends up with a complete list again.
sub determine_description_offset
{
    my ($self, $maxoffset) = @_;

    my $remaining = $self->{'option_list'};
    my @snapshot  = @$remaining;

    my $current = pop(@$remaining);
    unless (defined($current)) {
        return $maxoffset;
    }

    # Find the length of the longest option string of this entry
    my $current_args = $current->{'args'};
    if (defined($current_args)) {
        my $width = &PrintUsage::find_longest_option_string($current_args);
        $maxoffset = $width if ($width > $maxoffset);
    }

    # Recurse over the rest of the hierarchy
    $maxoffset = $self->determine_description_offset($maxoffset);
    $self->{'option_list'} = \@snapshot;
    return $maxoffset;
}
231
232
# Print the text usage of the first entry of the option list, then
# recurse for the remaining entries.
#
#   $descoffset  - length of the longest option string (see
#                  determine_description_offset)
#   $isleafclass - true for the most specific class: its description and
#                  the usage line are printed and its options are headed
#                  "specific" rather than "general"
#
# The option list is consumed by the recursion and restored from a
# snapshot afterwards.
sub print_plugout_usage
{
    my ($self, $descoffset, $isleafclass) = @_;

    my $pending  = $self->{'option_list'};
    my @snapshot = @$pending;

    my $current = shift(@$pending);
    return unless (defined($current));

    my ($plugoutname, $plugoutargs, $plugoutdesc)
        = @{$current}{'name', 'args', 'desc'};

    # Produce the usage information using the data structure above
    if ($isleafclass) {
        gsprintf(STDERR, "$plugoutdesc\n\n") if (defined($plugoutdesc));
        gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
    }

    # Display the plugout options, if there are some
    if (defined($plugoutargs)) {
        if ($isleafclass) {
            gsprintf(STDERR, " {common.specific_options}:\n");
        }
        else {
            gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
        }

        # 2 spaces between options & descriptions
        &PrintUsage::print_options_txt($plugoutargs, $descoffset + 2);
    }

    # Recurse up the plugout hierarchy
    $self->print_plugout_usage($descoffset, 0);
    $self->{'option_list'} = \@snapshot;
}
276
277
# Report a fatal error on standard output and terminate the process
# with exit status -1.  Called as a plain function, not as a method.
#
#   $strFunctionName - name of the function the error occurred in
#   $strError        - the error message
sub error
{
    my ($strFunctionName, $strError) = @_;

    my $report = join("",
        "Error occoured in BasePlugout.pm\n",
        "In Function: ", $strFunctionName, "\n",
        "Error Message: ", $strError, "\n");
    print $report;

    exit(-1);
}
288
# Record how document identifiers are to be generated.
#
#   $type     - "hash", "incremental", "dirname" or "assigned"; anything
#               else falls back to "hash"
#   $metadata - for "assigned" only: the metadata element holding the
#               identifier (defaults to "dc.Identifier")
sub set_OIDtype {
    my ($self, $type, $metadata) = @_;

    $self->{'OIDtype'}
        = ($type =~ /^(hash|incremental|dirname|assigned)$/) ? $type : "hash";

    if ($type =~ /^assigned$/) {
        $self->{'OIDmetadata'} = (defined $metadata) ? $metadata : "dc.Identifier";
    }
}
307
# Remember the directory all output of this plugout is written below.
sub set_output_dir
{
    my ($self, $output_dir) = @_;

    $self->{'output_dir'} = $output_dir;
}
315
# Alternative spelling of set_output_dir: stores the output directory
# under the same 'output_dir' key.
sub setoutputdir
{
    my ($self, $output_dir) = @_;

    $self->{'output_dir'} = $output_dir;
}
323
# Return the output directory previously stored on the object (undef
# when none has been set).
sub get_output_dir
{
    my ($self) = @_;

    return $self->{'output_dir'};
}
330
# Alternative spelling of get_output_dir: returns the stored
# 'output_dir' value.
sub getoutputdir
{
    my ($self) = @_;

    return $self->{'output_dir'};
}
337
# Return the 'output_info' object supplied through the arguments.
sub getoutputinfo
{
    my ($self) = @_;

    return $self->{'output_info'};
}
344
345
# Open $output_file_name for writing (truncating it) and return the
# file handle.  Dies when the file cannot be opened.
#
# Every call returns its own handle, so several output files can be
# open at once.  The three-argument open also means the file name is
# never interpreted as an open mode.
sub get_output_handler
{
    my $self = shift (@_);

    my ($output_file_name) = @_;

    open(my $output_handle, '>', $output_file_name)
        or die "Can not open a file handler for  $output_file_name: $!\n";

    return $output_handle;
}
356
# Close a handle obtained from get_output_handler.
sub release_output_handler
{
    my ($self, $outhandler) = @_;

    close($outhandler);
}
365
# Write the opening lines of an XML document to $handle.
#
#   $handle     - file handle (or handle name) to print to
#   $docroot    - when defined, an opening <$docroot> tag is written
#   $nondoctype - when defined, the Greenstone Archive DOCTYPE line is
#                 left out
sub output_xml_header {
    my ($self, $handle, $docroot, $nondoctype) = @_;

    my $header = '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";

    unless (defined $nondoctype) {
        $header .= '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
    }

    if (defined $docroot) {
        $header .= "<$docroot>\n";
    }

    print {$handle} $header;
}
378
# Write the closing </$docroot> tag to $handle; nothing is written when
# $docroot is undefined.
sub output_xml_footer {
    my ($self, $handle, $docroot) = @_;

    if (defined $docroot) {
        print {$handle} "</$docroot>\n";
    }
}
384
# Entry point for exporting one document.
#
# Stamps the document as modified, then either delegates to
# group_process (when group_size is greater than 1) or works out the
# document's directory and calls the subclass' saveas followed by
# archiveinf_gdbm.  Nothing is saved when no 'output_info' is set.
sub process {
    my ($self, $doc_obj) = @_;

    $doc_obj->set_lastmodified();

    if ($self->{'group_size'} > 1) {
        $self->group_process ($doc_obj);
        return;
    }

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    # The result is not used below; the call is kept as in the
    # original sequence.
    my $top_section = $doc_obj->get_top_section();

    # get document's directory
    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());

    return unless (defined $self->{'output_info'});

    # call subclass' saveas method, then record the document
    $self->saveas($doc_obj, $doc_dir);
    $self->archiveinf_gdbm($doc_obj, $doc_dir);
}
414
# Register the document in the 'output_info' object together with the
# sort key built from the 'sortmeta' setting.
#
# 'sortmeta' is a comma separated list of metadata names.  For each
# name the top section's value is looked up, the configured prefix and
# suffix patterns are stripped, the value is passed through
# sorttools::format_metadata_for_sorting, and the results are
# concatenated.  Without a usable 'sortmeta' an empty key is stored.
sub store_output_info_reference {
    my ($self, $doc_obj) = @_;

    my $output_info = $self->{'output_info'};
    my $metaname    = $self->{'sortmeta'};

    unless (defined $metaname && $metaname =~ /\S/) {
        $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
        return;
    }

    my $sort_key    = "";
    my $top_section = $doc_obj->get_top_section();

    foreach my $cmn (split(/,/, $metaname)) {
        my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
        next unless ($meta);

        # do remove prefix/suffix - this will apply to all values
        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
        $sort_key .= $meta if ($meta);
    }

    # store reference in the output_info
    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $sort_key);
}
445
# Export a document into a shared "doc.xml" that holds up to
# 'group_size' documents.
#
# A new archive file is started whenever 'gs_count' is a multiple of
# the group size (closing the previous one first).  Associated files
# are written to the directory of the current archive file.  The
# document's XML is then appended to the open GROUPPROCESS handle and
# 'gs_count' is incremented.
#
# Returns early, without writing the document, when the previous file
# cannot be closed or the new one cannot be opened.
sub group_process {

    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    my $groupsize = $self->{'group_size'};
    my $gs_count = $self->{'gs_count'};
    my $open_new_file = (($gs_count % $groupsize)==0);
    my $outhandle = $self->{'output_handle'};

    # opening a new file, or document has associated files => directory needed
    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {

        # The directory the archive file (doc.xml) and all associated files
        # should end up in
        my $doc_dir;

        # If we've determined it's time for a new file, open it now
        if ($open_new_file || !defined($self->{'gs_doc_dir'})) {
            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());

            my $output_dir = $self->get_output_dir();
            &util::mk_all_dir ($output_dir) unless -e $output_dir;
            my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
            my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");

            if ($gs_count>0) {
                return if (!$self->close_file_output());
            }

            # Three-argument open: the file name is never parsed for a mode.
            unless (open (GROUPPROCESS, '>', $doc_file)) {
                print $outhandle "BasePlugout::group_process could not write to file $doc_file\n";
                return;
            }

            $self->{'gs_filename'} = $doc_file;
            $self->{'short_doc_file'} = $short_doc_file;
            $self->{'gs_OID'} = $OID;
            $self->{'gs_doc_dir'} = $doc_dir;

            $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
        }
        # Otherwise load the same archive document directory used last time
        else {
            $doc_dir = $self->{'gs_doc_dir'};
        }

        # copy all the associated files, add this information as metadata
        # to the document
        print $outhandle "Writing associated files to $doc_dir\n";
        $self->process_assoc_files ($doc_obj, $doc_dir);

        # look up 'gsdlmetafile' metadata and store that information
        # explicitly in $doc_obj
        $self->process_metafiles_metadata ($doc_obj);
    }

    # save this document
    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    print GROUPPROCESS $section_text;

    $self->{'gs_count'}++;
}
512
513
# Abstract method: writes one document to its directory.  Subclasses
# must override it; the base implementation always dies.
sub saveas {
    my $self = shift (@_);

    die "BasePlugout::saveas function must be implemented in sub classes\n";
}
519
# Work out the directory (relative to the output directory) that the
# document with identifier $OID is written to, creating it unless the
# 'group' setting is true.
#
#   $OID             - document identifier
#   $source_filename - path of the source file; only used when
#                      'keep_import_structure' is set
#
# An existing entry in 'output_info' is reused; otherwise the
# directory mirrors the source path below GSDLIMPORTDIR (if
# 'keep_import_structure' is set) or a new one is derived from the OID.
# Returns nothing when no 'output_info' object is available.
sub get_doc_dir {
    my $self = shift (@_);
    my ($OID, $source_filename) = @_;

    my $working_dir  = $self->get_output_dir();
    my $working_info = $self->{'output_info'};
    return if (!defined $working_info);

    my $doc_info = $working_info->get_info($OID);
    my $doc_dir = '';

    if (defined $doc_info && scalar(@$doc_info) >= 1)
    {
        # This OID already has an archives directory, so use it again
        $doc_dir = $doc_info->[0];
        $doc_dir = '' unless defined $doc_dir;
        $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
    }
    elsif ($self->{'keep_import_structure'})
    {
        # Loaded here because this file does not load File::Basename itself.
        require File::Basename;

        $source_filename = &File::Basename::dirname($source_filename);
        $source_filename =~ s/[\\\/]+/\//g;
        $source_filename =~ s/\/$//;

        # Skip the import directory and the separator that follows it.
        # A document directly inside the import directory has nothing
        # left, which must give "" rather than substr's out-of-range undef.
        my $import_dir = defined($ENV{'GSDLIMPORTDIR'}) ? $ENV{'GSDLIMPORTDIR'} : "";
        my $offset = length($import_dir) + 1;
        $doc_dir = ($offset <= length($source_filename))
            ? substr($source_filename, $offset)
            : "";
    }

    # We have to use a new archives directory for this document
    if ($doc_dir eq "")
    {
        $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
    }

    if (!defined $self->{'group'} || !$self->{'group'}){
        &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
    }

    return $doc_dir;
}
558
# Derive a fresh "<name>.dir" directory name for $OID.
#
# Path separators are removed from the OID and it is consumed in
# chunks of up to 8 characters, each chunk adding one path level.
# Another level is added while OID characters remain and either
# "<path>.dir" already exists below $working_dir, or $working_info
# reports 1024 or more entries and fewer than two levels have been
# used.  If the result still names an existing directory, "-1", "-2",
# ... is appended until it does not.
sub get_new_doc_dir{
   my ($self, $working_info, $working_dir, $OID) = @_;

   # remove any \ and / from the OID
   my $remaining = $OID;
   $remaining =~ s/[\\\/]//g;

   my $path   = "";
   my $levels = 0;

   while (1) {
       $path .= "/" if $levels > 0;
       if ($remaining =~ s/^(.{1,8})//) {
           $path .= $1;
           $levels++;
       }

       last if ($remaining eq "");
       last unless ((-d &util::filename_cat ($working_dir, "$path.dir")) ||
                    ($working_info->size() >= 1024 && $levels < 2));
   }

   my $base = $path;
   for (my $suffix = 1; -d &util::filename_cat ($working_dir, "$path.dir"); $suffix++) {
       $path = "$base-$suffix";
   }

   return "$path.dir";
}
588
# Link the document's associated files into its archive directory and
# record them as metadata on the document.
#
#   $doc_obj - the document
#   $doc_dir - the document's directory, relative to the output directory
#   $handle  - accepted for interface compatibility; not used
#
# "assocfilepath" is always set on the top section.  For every
# associated file found on disk a hard link is made in the working
# directory and a "gsdlassocfile" value "<file>:<third field>:<dir>" is
# added.  Returns without doing anything when no output directory is
# set.
sub process_assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir, $handle) = @_;

    my $outhandle = $self->{'output_handle'};

    my $output_dir = $self->get_output_dir();
    return if (!defined $output_dir);

    &util::mk_all_dir ($output_dir) unless -e $output_dir;

    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
    &util::mk_all_dir ($working_dir) unless -e $working_dir;

    # set the assocfile path (even if we have no assoc files - need this for lucene)
    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
                                         "assocfilepath",
                                         "$doc_dir");

    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
        # Split the name the file is known by into directory and file part.
        my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
        $dir = "" unless defined $dir;

        # Without a file part there is nothing sensible to link to.
        if (!defined $afile) {
            if ($self->{'verbosity'} > 2) {
                print $outhandle "BasePlugout::process_assoc_files skipping an associated file " .
                    "that has no file name component\n";
            }
            next;
        }

        my $real_filename = $assoc_file_rec->[0];
        # some associated (image) files arrive with a stray leading
        # backslash in front of the full path
        $real_filename =~ s/^\\(.*)/$1/i;

        if (-e $real_filename) {
            my $filename = &util::filename_cat($working_dir, $afile);

            &util::hard_link ($real_filename, $filename, $self->{'verbosity'});

            $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
                                         "gsdlassocfile",
                                         "$afile:$assoc_file_rec->[2]:$dir");
        } elsif ($self->{'verbosity'} > 2) {
            print $outhandle "BasePlugout::process couldn't copy the associated file " .
                "$real_filename to $afile\n";
        }
    }
}
652
653
# Convert the top section's "gsdlmetafile" values into explicit
# metadata-file records on the document.
#
# Each value has the form "<full path> : <file>"; both parts are handed
# to $doc_obj->metadata_file, after which the "gsdlmetafile" metadata
# is removed from the top section.
sub process_metafiles_metadata
{
    my ($self, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");

    for my $pair (@$metafiles) {
        my ($full_metafile, $metafile) = split(/ : /, $pair);
        $doc_obj->metadata_file($full_metafile, $metafile);
    }

    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
}
670
# Add a list of files to one field of a document's archive record.
#
#   $files           - array ref of records; element 0 is the file's
#                      location on disk, element 1 the name stored in
#                      the record
#   $field           - key of $oid_files to append to (e.g. "assoc-file")
#   $collect_dir     - when defined, this prefix is stripped from the
#                      on-disk location before it is used as a
#                      reverse-lookup key
#   $oid_files       - hash ref receiving the stored names
#   $reverse_lookups - hash ref receiving the reverse-lookup keys
#
# Files that do not exist on disk are reported on STDERR and skipped.
sub archiveinf_files_to_field
{
    my $self = shift(@_);
    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;

    foreach my $file_rec (@$files) {
        my $real_filename = $file_rec->[0];
        my $full_file = $file_rec->[1];

        # some associated (image) files arrive with a stray leading
        # backslash in front of the full path
        $real_filename =~ s/^\\(.*)/$1/i;
        if (-e $real_filename) {

            if (defined $collect_dir) {
                # \Q..\E quotes every regex metacharacter, so directories
                # such as "Program Files (x86)" are matched literally.
                $real_filename =~ s/^\Q$collect_dir\E//;
            }

            $reverse_lookups->{$real_filename} = 1;

            push(@{$oid_files->{$field}},$full_file);
        }
        else {
            print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
        }
    }
}
701
# Record which files make up a document.
#
# Writes an entry keyed by the document's OID to the
# "archiveinf-doc.gdb" database in the output directory.  The entry
# holds the fields doc-file, index-status, src-file, assoc-file and
# meta-file, each as a list.  Then registers a reverse lookup
# (file -> OID) with the 'output_info' object for the source file and
# for every associated / metadata file found on disk.
#
# NOTE(review): callers in this file pass a second argument ($doc_dir);
# it is not used here.
sub archiveinf_gdbm
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
    if (defined $collect_dir) {
        my $dirsep_regexp = &util::get_os_dirsep();

        if ($collect_dir !~ /$dirsep_regexp$/) {
            # ensure there is a slash at the end
            $collect_dir .= &util::get_dirsep();
        }
    }

    my $oid = $doc_obj->get_OID();
    my $source_filename = $doc_obj->get_unmodified_source_filename();

    my $working_info = $self->{'output_info'};
    my $doc_info = $working_info->get_info($oid);
    # An unknown OID yields no record; both values then stay undefined.
    my ($doc_file,$index_status) = defined($doc_info) ? @$doc_info : ();

    # Every field is stored as a list, which is the shape handed to
    # dbutil::write_infodb_entry_gdbm below.
    my $oid_files = { 'doc-file'     => [ $doc_file ],
                      'index-status' => [ $index_status ],
                      'src-file'     => [ $source_filename ],
                      'assoc-file'   => [],
                      'meta-file'    => [] };

    my $reverse_lookups = { $source_filename => "1" };

    $self->archiveinf_files_to_field($doc_obj->get_assoc_files(),"assoc-file",
                                     $collect_dir,$oid_files,$reverse_lookups);

    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
                                     $collect_dir,$oid_files,$reverse_lookups);

    # better not to commit to a particular db implementation, but
    # for simplicity, will use GDBM for now.
    my $output_dir = $self->{'output_dir'};
    my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc.gdb");

    my $infodb_file_handle
        = &dbutil::open_infodb_write_handle_gdbm($doc_db,"append");
    &dbutil::write_infodb_entry_gdbm($infodb_file_handle,$oid,$oid_files);
    &dbutil::close_infodb_write_handle_gdbm($infodb_file_handle);

    foreach my $rl (keys %$reverse_lookups) {
        $working_info->add_reverseinfo($rl,$oid);
    }
}
813
814
# Configure how sort keys are built (see store_output_info_reference).
#
#   $sortmeta     - comma separated list of metadata names
#   $removeprefix - pattern stripped from the start of each value; a
#                   leading "^" is dropped because the anchor is added
#                   when the pattern is applied
#   $removesuffix - pattern stripped from the end of each value; a
#                   trailing "$" is dropped likewise
#
# Prefix and suffix are only stored when they are true values.
sub set_sortmeta {
    my ($self, $sortmeta, $removeprefix, $removesuffix) = @_;

    $self->{'sortmeta'} = $sortmeta;

    if ($removeprefix) {
        $removeprefix =~ s/^\^//; # don't need a leading ^
        $self->{'removeprefix'} = $removeprefix;
    }

    if ($removesuffix) {
        $removesuffix =~ s/\$$//; # don't need a trailing $
        $self->{'removesuffix'} = $removesuffix;
    }
}
829
# Start the external ApplyXSLT java program and keep a pipe to it in
# $self->{'xslt_writer'}.
#
#   $output_file_name - written to the pipe after the "<?DocStart?>"
#                       marker
#   $xslt_file        - stylesheet to apply; when it is undefined,
#                       empty or missing on disk nothing is done
#
# Dies when the pipe cannot be opened.  The command is run through the
# shell, so every path placed on it is wrapped in double quotes to keep
# paths containing spaces in one piece.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file)=@_;

    return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;

    my $java_dir  = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
    my $xalan_jar = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    # The class path separator differs between windows and other systems.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} eq "windows");
    my $path_separator = $on_windows ? ";" : ":";

    my $java_class_path = "\"" . $java_dir . $path_separator . $xalan_jar . "\"";

    my $cmd = "java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
        my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
        $cmd .= "-m $mapping_file_path";
    }

    # '|-' with a single command string: we write, the command reads.
    open(*XMLWRITER, '|-', $cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = *XMLWRITER;

    print XMLWRITER "<?DocStart?>\n";
    print XMLWRITER "$output_file_name\n";
}
872 
873
# Finish the conversation with the XSLT program: send the
# "<?DocEnd?>" marker, close the pipe and clear 'xslt_writer'.
# Does nothing when no pipe is open.
sub close_xslt_pipe
{
  my ($self) = @_;

  my $xsltwriter = $self->{'xslt_writer'};
  return unless defined $xsltwriter;

  print $xsltwriter "<?DocEnd?>\n";
  close($xsltwriter);

  undef $self->{'xslt_writer'};
}
889
# Finish the archive file started by group_process.
#
# Writes the closing </Archive> tag and closes the GROUPPROCESS handle
# if it is open, optionally gzips the file, and registers it with the
# 'output_info' object under the OID remembered in 'gs_OID'.
#
# Returns 1 on success, 0 when gzipping did not produce the expected
# file or no 'output_info' object is available.
sub close_file_output
{
    my ($self) = @_;

    # make sure that the handle has been opened - it won't be if we failed
    # to import any documents...
    if (defined(fileno(GROUPPROCESS))) {
        # Fully qualified, as in group_process, so the handle name does
        # not depend on the package that ends up printing to it.
        $self->output_xml_footer('BasePlugout::GROUPPROCESS','Archive');
        close GROUPPROCESS;
    }

    my $OID = $self->{'gs_OID'};
    my $short_doc_file = $self->{'short_doc_file'};

    # NOTE(review): this reads the 'gzip' key, while the argument
    # declared at the top of this file is named 'gzip_output' - confirm
    # which key is actually set by callers.
    if ($self->{'gzip'}) {
        my $doc_file = $self->{'gs_filename'};
        # List form: the path is passed as one argument, never through
        # a shell, so spaces and shell metacharacters are harmless.
        system('gzip', $doc_file);
        $doc_file .= ".gz";
        $short_doc_file .= ".gz";
        if (!-e $doc_file) {
            my $outhandle = $self->{'output_handle'};
            print $outhandle "error while gzipping: $doc_file doesn't exist\n";
            return 0;
        }
    }

    # store reference in output_info
    my $output_info = $self->{'output_info'};
    return 0 if (!defined $output_info);
    $output_info->add_info($OID, $short_doc_file, undef, undef);
    return 1;
}
922
923
# Hook for plugouts that write grouped output: a subclass whose
# is_group method can return 1 should override this.  The base version
# does nothing.
sub close_group_output{
   my $self = $_[0];
}
928
# Tell whether this plugout writes grouped output; the base class
# never does.
sub is_group {
    my ($self) = @_;

    return 0;
}
933
# Element names recognised as Dublin Core when they occur as "ex."
# (or unprefixed) metadata.
my $dc_set = { Title => 1,
               Creator => 1,
               Subject => 1,
               Description => 1,
               Publisher => 1,
               Contributor => 1,
               Date => 1,
               Type => 1,
               Format => 1,
               Identifier => 1,
               Source => 1,
               Language => 1,
               Relation => 1,
               Coverage => 1,
               Rights => 1};


# Return an XML fragment with the Dublin Core metadata of a section.
#
#   $doc_obj - the document
#   $section - section identifier (defaults to "")
#   $version - "oai_dc" selects <dc:element> output; anything else
#              gives <dcvalue element="..."> output
#
# Every "dc." value is emitted with its element name lower-cased.  For
# elements of $dc_set that have no "dc." value, matching "ex." or
# unprefixed values are used instead, in alphabetical element order.
# Returns "" when the section does not exist and a placeholder line
# when no Dublin Core metadata is found.  The document's own metadata
# is left untouched.
sub get_dc_metadata {
    my $self = shift(@_);
    my ($doc_obj, $section, $version) = @_;

    # build up string of dublin core metadata
    $section="" unless defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my $use_oai_dc = (defined $version && ($version eq "oai_dc"));

    # Formats one element/value pair in the requested flavour.
    my $format_pair = sub {
        my ($dc_element, $escaped_value) = @_;
        if ($use_oai_dc) {
            return "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
        }
        # qualifier???
        return '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
    };

    my $explicit_dc = {};
    my $explicit_ex = {};

    my $all_text="";
    foreach my $data (@{$section_ptr->{'metadata'}}){
        my $escaped_value = &docprint::escape_text($data->[1]);
        my $meta_name = $data->[0];

        if ($meta_name =~ m/^dc\.(.*)/) {
            # Lower-case a copy only: changing $data->[0] itself would
            # rename the metadata inside the document.
            my $dc_element = $1;
            $dc_element =~ tr/A-Z/a-z/;

            push(@{$explicit_dc->{$dc_element}},$escaped_value);
            $all_text .= $format_pair->($dc_element, $escaped_value);
        }
        elsif (($meta_name =~ m/^ex\./) || ($meta_name !~ m/\./)) {
            $meta_name =~ m/^(ex\.)?(.*)/;
            my $ex_element =  $2;

            if (defined $dc_set->{$ex_element}) {
                push(@{$explicit_ex->{lc($ex_element)}},$escaped_value);
            }
        }
    }

    # go through dc_set and for any element *not* defined in explicit_dc
    # that does exist in explicit_ex, add it in as metadata
    # (sorted, so the output does not depend on hash ordering)
    foreach my $k ( sort keys %$dc_set ) {
        my $lc_k = lc($k);

        next if (defined $explicit_dc->{$lc_k});
        next unless (defined $explicit_ex->{$lc_k});

        foreach my $v (@{$explicit_ex->{$lc_k}}) {
            $all_text .= $format_pair->($lc_k, $v);
        }
    }

    if ($all_text eq "") {
        $all_text .= "   There is no Dublin Core metatdata in this document\n";
    }
    # strip control characters other than newline and carriage return
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}
1035
# Alternative metadata dump: returns one
# <Metadata Type="..." Name="...">value</Metadata> line for every
# metadata item of the section.  A name of the form "type.name" is
# split at the dot; a name without a second component is given the
# type "ex".
#
# According to the original note, this method was added by Jeffrey and
# committed by Shaoqun; the reason is unknown and it is no longer used.
sub new_get_dc_metadata {

    my ($self, $doc_obj, $section, $version) = @_;

    # build up string of metadata
    $section = "" unless defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my @rows = ();
    foreach my $data (@{$section_ptr->{'metadata'}}){
        my $escaped_value = &docprint::escape_text($data->[1]);

        my @parts = split('\.', $data->[0]);
        my ($type, $name) = (defined $parts[1])
            ? ($parts[0], $parts[1])
            : ("ex", $parts[0]);

        push(@rows, '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n");
    }

    return join("", @rows);
}
1073
1074
10751;
Note: See TracBrowser for help on using the browser.