root/gsdl/trunk/perllib/plugouts/BasePlugout.pm @ 19829

Revision 19829, 28.6 KB (checked in by davidb, 11 years ago)

doc.pm API extended to include call for finding out the original source filename (rather than the one where the rename_method has been applied). Useful for incremental building, and probably other things too

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use gsprintf 'gsprintf';
35use printusage;
36use parse2;
37use GDBMUtils;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60    'hiddengli' => "no"},
61       { 'name' => "output_handle",
62    'desc' => "{BasPlugout.output_handle}",
63    'type' => "string",
64        'deft' =>  'STDERR',
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67       { 'name' => "verbosity",
68    'desc' => "{BasPlugout.verbosity}",
69    'type' => "int",
70        'deft' =>  "0",
71    'reqd' => "no", 
72        'hiddengli' => "no"},
73       { 'name' => "gzip_output",
74    'desc' => "{BasPlugout.gzip_output}",
75    'type' => "flag",
76    'reqd' => "no", 
77        'hiddengli' => "no"},
78       { 'name' => "debug",
79     'desc' => "{BasPlugout.debug}",
80     'type' => "flag",
81     'reqd' => "no",
82     'hiddengli' => "yes"}
83];
84
85my $options = { 'name'     => "BasePlugout",
86        'desc'     => "{BasPlugout.desc}",
87        'abstract' => "yes",
88        'inherits' => "no",
89        'args'     => $arguments};
90
91sub new
92{
93    my $class = shift (@_);
94
95    my ($plugoutlist,$args,$hashArgOptLists) = @_;
96    push(@$plugoutlist, $class);
97
98    my $strPlugoutName = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
99
100    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
101    push(@{$hashArgOptLists->{"OptList"}},$options);
102
103    my $self = {};
104    $self->{'plugout_type'} = $class;
105    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
106    $self->{"info_only"} = 0;
107
108    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
109    # the args, just return the object. 
110    foreach my $strArg (@{$args})
111    {
112    if(defined $strArg && $strArg eq "-gsdlinfo")
113    {
114        $self->{"info_only"} = 1;
115        return bless $self, $class;
116    }
117    }
118   
119    delete $self->{"info_only"};
120   
121    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
122    {
123    my $classTempClass = bless $self, $class;
124    print STDERR "<BadPlugout d=$self->{'plugout_name'}>\n";
125    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $self->{'plugout_name'});
126    $classTempClass->print_txt_usage("");  # Use default resource bundle
127    die "\n";
128    }
129
130 
131    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
132    {
133      ##$self->{'xslt_file'} =~ s/\"//g;##working on Windows???
134    print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n" and  die "\n" unless (-e $self->{'xslt_file'});
135    }
136
137    $self->{'gs_count'} = 0;   
138
139    $self->{'keep_import_structure'} = 0;
140
141    return bless $self, $class;
142
143}
144
145sub print_xml_usage
146{
147    my $self = shift(@_);
148    my $header = shift(@_);
149    my $high_level_information_only = shift(@_);
150   
151    # XML output is always in UTF-8
152    gsprintf::output_strings_in_UTF8;
153
154    if ($header) {
155    &PrintUsage::print_xml_header("plugout");
156    }
157    $self->print_xml($high_level_information_only);
158}
159
160
161sub print_xml
162{
163    my $self = shift(@_);
164    my $high_level_information_only = shift(@_);
165
166    my $optionlistref = $self->{'option_list'};
167    my @optionlist = @$optionlistref;
168    my $plugoutoptions = shift(@$optionlistref);
169    return if (!defined($plugoutoptions));
170
171    gsprintf(STDERR, "<PlugoutInfo>\n");
172    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
173    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
174    $desc =~ s/</&amp;lt;/g; # doubly escaped
175    $desc =~ s/>/&amp;gt;/g;
176    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
177    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
178    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
179    unless (defined($high_level_information_only)) {
180    gsprintf(STDERR, "  <Arguments>\n");
181    if (defined($plugoutoptions->{'args'})) {
182        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
183    }
184    gsprintf(STDERR, "  </Arguments>\n");
185
186    # Recurse up the plugout hierarchy
187    $self->print_xml();
188    }
189    gsprintf(STDERR, "</PlugoutInfo>\n");
190}
191
192
193sub print_txt_usage
194{
195    my $self = shift(@_);
196
197    # Print the usage message for a plugout (recursively)
198    my $descoffset = $self->determine_description_offset(0);
199    $self->print_plugout_usage($descoffset, 1);
200}
201
202sub determine_description_offset
203{
204    my $self = shift(@_);
205    my $maxoffset = shift(@_);
206
207    my $optionlistref = $self->{'option_list'};
208    my @optionlist = @$optionlistref;
209    my $plugoutoptions = pop(@$optionlistref);
210    return $maxoffset if (!defined($plugoutoptions));
211
212    # Find the length of the longest option string of this download
213    my $plugoutargs = $plugoutoptions->{'args'};
214    if (defined($plugoutargs)) {
215    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
216    if ($longest > $maxoffset) {
217        $maxoffset = $longest;
218    }
219    }
220
221    # Recurse up the download hierarchy
222    $maxoffset = $self->determine_description_offset($maxoffset);
223    $self->{'option_list'} = \@optionlist;
224    return $maxoffset;
225}
226
227
228sub print_plugout_usage
229{
230    my $self = shift(@_);
231    my $descoffset = shift(@_);
232    my $isleafclass = shift(@_);
233
234    my $optionlistref = $self->{'option_list'};
235    my @optionlist = @$optionlistref;
236    my $plugoutoptions = shift(@$optionlistref);
237    return if (!defined($plugoutoptions));
238
239    my $plugoutname = $plugoutoptions->{'name'};
240    my $plugoutargs = $plugoutoptions->{'args'};
241    my $plugoutdesc = $plugoutoptions->{'desc'};
242
243    # Produce the usage information using the data structure above
244    if ($isleafclass) {
245    if (defined($plugoutdesc)) {
246        gsprintf(STDERR, "$plugoutdesc\n\n");
247    }
248    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
249    }
250
251    # Display the download options, if there are some
252    if (defined($plugoutargs)) {
253    # Calculate the column offset of the option descriptions
254    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
255
256    if ($isleafclass) {
257        gsprintf(STDERR, " {common.specific_options}:\n");
258    }
259    else {
260        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
261    }
262
263    # Display the download options
264    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
265    }
266
267    # Recurse up the download hierarchy
268    $self->print_plugout_usage($descoffset, 0);
269    $self->{'option_list'} = \@optionlist;
270}
271
272
273sub error
274{
275      my ($strFunctionName,$strError) = @_;
276    {
277    print "Error occoured in BasePlugout.pm\n".
278        "In Function: ".$strFunctionName."\n".
279        "Error Message: ".$strError."\n";
280    exit(-1);
281    } 
282}
283
284# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
285sub set_OIDtype {
286    my $self = shift (@_);
287    my ($type, $metadata) = @_;
288
289    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
290    $self->{'OIDtype'} = $type;
291    } else {
292    $self->{'OIDtype'} = "hash";
293    }
294    if ($type =~ /^assigned$/) {
295    if (defined $metadata) {
296        $self->{'OIDmetadata'} = $metadata;
297    } else {
298        $self->{'OIDmetadata'} = "dc.Identifier";
299    }
300    }
301}
302
303sub set_output_dir
304{
305    my $self = shift @_;
306    my ($output_dir) = @_;
307
308   $self->{'output_dir'} = $output_dir;
309}
310
311sub setoutputdir
312{
313    my $self = shift @_;
314    my ($output_dir) = @_;
315
316    $self->{'output_dir'} = $output_dir;
317}
318
319sub get_output_dir
320{
321    my $self = shift (@_);
322
323   return $self->{'output_dir'};
324}
325
326sub getoutputdir
327{
328    my $self = shift (@_);
329
330    return $self->{'output_dir'};
331}
332
333sub getoutputinfo
334{
335    my $self = shift (@_);
336
337    return $self->{'output_info'};
338}
339
340
341sub get_output_handler
342{
343    my $self = shift (@_);
344
345    my ($output_file_name) = @_;
346     
347    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
348
349    return *OUTPUT;           
350}
351
352sub release_output_handler
353{
354    my $self = shift (@_);
355    my ($outhandler) = @_;
356
357    close($outhandler);
358
359}
360
361sub output_xml_header {
362    my $self = shift (@_);
363    my ($handle,$docroot,$nondoctype) = @_;
364   
365    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
366   
367    if (!defined $nondoctype){
368    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
369    }
370
371    print $handle "<$docroot>\n" if defined $docroot;
372}
373
374sub output_xml_footer {
375    my $self = shift (@_);
376    my ($handle,$docroot) = @_;
377    print $handle "</$docroot>\n" if defined $docroot;
378}
379
380sub process {
381    my $self = shift (@_);
382    my ($doc_obj) = @_;
383   
384    $doc_obj->set_lastmodified();
385
386     if ($self->{'group_size'} > 1) {
387     $self->group_process ($doc_obj);
388    return;
389    }
390
391    my $OID = $doc_obj->get_OID();
392    $OID = "NULL" unless defined $OID;     
393
394    my $top_section = $doc_obj->get_top_section();
395
396    #get document's directory
397    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
398   
399    my $output_info = $self->{'output_info'};
400    return if (!defined $output_info);
401     
402    ##############################
403    # call subclass' saveas method
404    ##############################
405    $self->saveas($doc_obj,$doc_dir);
406    $self->archiveinf_gdbm($doc_obj,$doc_dir);
407
408}
409
410sub store_output_info_reference {
411    my $self = shift (@_);
412    my ($doc_obj) = @_;
413
414    my $output_info = $self->{'output_info'};
415    my $metaname = $self->{'sortmeta'};
416    if (!defined $metaname || $metaname !~ /\S/) {
417    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
418    return;
419    }
420   
421    my $metadata = "";
422    my $top_section = $doc_obj->get_top_section();
423   
424    my @commameta_list = split(/,/, $metaname);
425    foreach my $cmn (@commameta_list) {
426    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
427    if ($meta) {
428        # do remove prefix/suffix - this will apply to all values
429        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
430        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
431        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
432        $metadata .= $meta if ($meta);
433    }
434    }
435
436    # store reference in the output_info     
437    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
438   
439}
440
441sub group_process {
442
443    my $self = shift (@_);
444    my ($doc_obj) = @_;
445   
446    my $OID = $doc_obj->get_OID();
447    $OID = "NULL" unless defined $OID;
448
449    my $groupsize = $self->{'group_size'};
450    my $gs_count = $self->{'gs_count'};
451    my $open_new_file = (($gs_count % $groupsize)==0);
452    my $outhandle = $self->{'output_handle'};
453
454    # opening a new file, or document has assoicated files => directory needed
455    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
456         
457        # The directory the archive file (doc.xml) and all associated files
458        # should end up in
459        my $doc_dir;
460        # If we've determined its time for a new file, open it now
461        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
462          {
463            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
464            # only if opening new file
465        my $output_dir = $self->get_output_dir();
466        &util::mk_all_dir ($output_dir) unless -e $output_dir;
467        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
468        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
469       
470        if ($gs_count>0)
471        {
472        return if (!$self->close_file_output());
473        }
474
475        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
476           
477
478        $self->{'gs_filename'} = $doc_file;
479        $self->{'short_doc_file'} = $short_doc_file;
480        $self->{'gs_OID'} = $OID;
481            $self->{'gs_doc_dir'} = $doc_dir;
482
483        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
484    }
485        # Otherwise load the same archive document directory used last time
486        else
487          {
488            $doc_dir = $self->{'gs_doc_dir'};
489          }
490
491    # copy all the associated files, add this information as metadata
492    # to the document
493        print $outhandle "Writing associated files to $doc_dir\n";
494    $self->process_assoc_files ($doc_obj, $doc_dir);
495
496    # look up 'gsdlmetafile' metadata and store that information
497    # explicitly in $doc_obj
498    $self->process_metafiles_metadata ($doc_obj);
499    }
500
501    # save this document
502    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
503    print GROUPPROCESS $section_text;
504
505    $self->{'gs_count'}++;
506}
507
508
509sub saveas {
510    my $self = shift (@_);
511   
512    die "Basplug::saveas function must be implemented in sub classes\n";
513}
514
515sub get_doc_dir {
516    my $self = shift (@_);
517    my ($OID, $source_filename) = @_;
518
519    my $working_dir  = $self->get_output_dir();
520    my $working_info = $self->{'output_info'};
521    return if (!defined $working_info);
522
523    my $doc_info = $working_info->get_info($OID);
524    my $doc_dir = '';
525
526    if (defined $doc_info && scalar(@$doc_info) >= 1)
527    {
528    # This OID already has an archives directory, so use it again
529    $doc_dir = $doc_info->[0];
530    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
531    }
532    elsif ($self->{'keep_import_structure'})
533    {
534    $source_filename = &File::Basename::dirname($source_filename);
535    $source_filename =~ s/[\\\/]+/\//g;
536    $source_filename =~ s/\/$//;
537
538        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
539    }
540
541    # We have to use a new archives directory for this document
542    if ($doc_dir eq "")
543    {
544    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
545    }
546
547    if (!defined $self->{'group'} || !$self->{'group'}){
548    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
549    }
550
551    return $doc_dir;
552}
553
554sub get_new_doc_dir{
555   my $self = shift (@_); 
556   my($working_info,$working_dir,$OID) = @_;     
557   
558   
559   my $doc_dir = "";
560   my $doc_dir_rest = $OID;
561   # remove any \ and / from the OID
562   $doc_dir_rest =~ s/[\\\/]//g;
563   my $doc_dir_num = 0;
564
565   do {
566       $doc_dir .= "/" if $doc_dir_num > 0;
567       if ($doc_dir_rest =~ s/^(.{1,8})//) {
568       $doc_dir .= $1;
569       $doc_dir_num++;
570       }
571   } while ($doc_dir_rest ne "" &&
572        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
573         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
574   my $i = 1;
575   my $doc_dir_base = $doc_dir;
576   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
577       $doc_dir = "$doc_dir_base-$i";
578       $i++;
579   }
580         
581   return "$doc_dir.dir";
582}
583
584sub process_assoc_files {
585    my $self = shift (@_);
586    my ($doc_obj, $doc_dir, $handle) = @_;
587
588    my $outhandle = $self->{'output_handle'};
589   
590    my $output_dir = $self->get_output_dir();
591    return if (!defined $output_dir);
592
593    &util::mk_all_dir ($output_dir) unless -e $output_dir;
594     
595    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
596    &util::mk_all_dir ($working_dir) unless -e $working_dir;
597
598    my @assoc_files = ();
599    my $filename;;
600
601    my $source_filename = $doc_obj->get_source_filename();
602
603    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
604
605    if (defined $collect_dir) {
606    my $dirsep_regexp = &util::get_os_dirsep();
607
608    if ($collect_dir !~ /$dirsep_regexp$/) {
609        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
610    }
611
612    # This test is never going to fail on Windows -- is this a problem?
613     
614    if ($source_filename !~ /^$dirsep_regexp/) {
615        $source_filename = &util::filename_cat($collect_dir, $source_filename);
616    }
617    }
618
619
620    # set the assocfile path (even if we have no assoc files - need this for lucene)
621    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
622                     "assocfilepath",
623                     "$doc_dir");
624    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
625    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
626    $dir = "" unless defined $dir;
627       
628   
629    my $real_filename = $assoc_file_rec->[0];
630    # for some reasons the image associate file has / before the full path
631    $real_filename =~ s/^\\(.*)/$1/i;
632    if (-e $real_filename) {
633
634        $filename = &util::filename_cat($working_dir, $afile);
635
636        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
637       
638        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
639                     "gsdlassocfile",
640                     "$afile:$assoc_file_rec->[2]:$dir");
641    } elsif ($self->{'verbosity'} > 2) {
642        print $outhandle "BasePlugout::process couldn't copy the associated file " .
643        "$real_filename to $afile\n";
644    }
645    }
646}
647
648
649sub process_metafiles_metadata
650{
651    my $self = shift (@_);
652    my ($doc_obj) = @_;
653
654    my $top_section = $doc_obj->get_top_section();
655    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
656
657    foreach my $metafile_pair (@$metafiles) {
658    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
659
660    $doc_obj->metadata_file($full_metafile,$metafile);
661    }
662
663    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
664}
665
666sub archiveinf_files_to_field
667{
668    my $self = shift(@_);
669    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
670
671    foreach my $file_rec (@$files) {
672    my $real_filename = $file_rec->[0];
673    my $full_file = $file_rec->[1];
674
675    # for some reasons the image associate file has / before the full path
676    $real_filename =~ s/^\\(.*)/$1/i;
677    if (-e $real_filename) {
678
679        if (defined $collect_dir) {
680        my $collect_dir_re_safe = $collect_dir;
681        $collect_dir_re_safe =~ s/\\/\\\\/g;
682        $collect_dir_re_safe =~ s/\./\\./g;
683
684        $real_filename =~ s/^$collect_dir_re_safe//;
685        }
686
687        $reverse_lookups->{$real_filename} = 1;
688
689        push(@{$oid_files->{$field}},$full_file);
690    }
691    else {
692        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
693    }
694    }
695}
696
697sub archiveinf_gdbm
698{
699    my $self = shift (@_);
700    my ($doc_obj) = @_;
701
702    my $verbosity = $self->{'verbosity'};
703
704    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
705    if (defined $collect_dir) {
706    my $dirsep_regexp = &util::get_os_dirsep();
707
708    if ($collect_dir !~ /$dirsep_regexp$/) {
709        # ensure there is a slash at the end
710        $collect_dir .= &util::get_dirsep();
711    }
712    }
713
714    my $oid = $doc_obj->get_OID();
715    my $source_filename = $doc_obj->get_unmodified_source_filename();
716
717    my $working_info = $self->{'output_info'};
718    my $doc_info = $working_info->get_info($oid);
719    my ($doc_file,$index_status) = @$doc_info;
720
721    my $oid_files = { 'doc-file' => $doc_file,
722              'index-status' => $index_status,
723              'src-file' => $source_filename,
724              'assoc-file' => [],
725              'meta-file'  => [] };
726   
727    my $reverse_lookups = { $source_filename => "1" };
728
729
730    $self->archiveinf_files_to_field($doc_obj->get_assoc_files(),"assoc-file",
731                     $collect_dir,$oid_files,$reverse_lookups);
732
733# *******
734#    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
735#   my $real_filename = $assoc_file_rec->[0];
736#   my $full_afile = $assoc_file_rec->[1];
737#
738#   # for some reasons the image associate file has / before the full path
739#   $real_filename =~ s/^\\(.*)/$1/i;
740#   if (-e $real_filename) {
741#
742#       if (defined $collect_dir) {
743#       my $collect_dir_re_safe = $collect_dir;
744#       $collect_dir_re_safe =~ s/\\/\\\\/g;
745#       $collect_dir_re_safe =~ s/\./\\./g;
746#
747#       $real_filename =~ s/^$collect_dir_re_safe//;
748#       }
749#
750#       $reverse_lookups->{$real_filename} = 1;
751#
752#       push(@{$oid_files->{'assoc-file'}},$full_afile);
753#   }
754#   else {
755#       print STDERR "Warning: archiveinf_gdbm()\n  $real_filename does not appear to be on the file system\n";
756#   }
757#    }
758
759    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
760                     $collect_dir,$oid_files,$reverse_lookups);
761
762
763    # better not to commit to a particular db implementation, but
764    # for simplicity, will use GDBM for now.
765
766    my $output_dir = $self->{'output_dir'};
767
768    my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc.gdb");
769    my $src_db = &util::filename_cat($output_dir,"archiveinf-src.gdb");
770
771#    my $doc_db_text = "";
772#    $doc_db_text .= "<doc-file>$oid_files->{'doc-file'}\n";
773#    $doc_db_text .= "<index-status>$oid_files->{'index-status'}\n";
774#    $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n";
775
776#    foreach my $af (@{$oid_files->{'assoc-file'}}) {
777#   $doc_db_text .= "<assoc-file>$af\n";
778#    }
779
780#    foreach my $mf (@{$oid_files->{'meta-file'}}) {
781#   $doc_db_text .= "<meta-file>$mf\n";
782#    }
783
784#    chomp($doc_db_text); # remove trailing \n
785
786    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
787
788    ### &GDBMUtils::gdbmDatabaseSet($doc_db,$oid,$doc_db_text);
789
790    # switch to using GDBMUtils
791
792    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
793    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
794    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
795
796    my $infodb_file_handle
797    = &dbutil::open_infodb_write_handle_gdbm($doc_db,"append");
798    &dbutil::write_infodb_entry_gdbm($infodb_file_handle,$oid,$oid_files);
799    &dbutil::close_infodb_write_handle_gdbm($infodb_file_handle);
800
801
802
803    foreach my $rl (keys %$reverse_lookups) {
804    ## &GDBMUtils::gdbmDatabaseAppend($src_db,$rl,"<oid>$oid\n");   
805    $working_info->add_reverseinfo($rl,$oid);
806    }   
807}
808
809
810sub set_sortmeta {
811    my $self = shift (@_);
812    my ($sortmeta, $removeprefix, $removesuffix) = @_;
813   
814    $self->{'sortmeta'} = $sortmeta;
815    if (defined ($removeprefix) && $removeprefix ) {
816    $removeprefix =~ s/^\^//; # don't need a leading ^
817    $self->{'removeprefix'} = $removeprefix;
818    }
819    if (defined ($removesuffix) && $removesuffix) {
820    $removesuffix =~ s/\$$//; # don't need a trailing $
821    $self->{'removesuffix'} = $removesuffix;
822    }
823}
824
825sub open_xslt_pipe
826{
827    my $self = shift @_;
828    my ($output_file_name, $xslt_file)=@_;
829
830   return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
831
832    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
833
834    my $mapping_file_path = "";
835
836    if ($ENV{'GSDLOS'} eq "windows"){
837    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
838    $xslt_file = "\"file:///".$xslt_file."\"";
839    $mapping_file_path = "\"file:///";
840    }
841    else{
842    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
843    }
844
845
846    $java_class_path = "\"".$java_class_path."\"";
847
848    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t $xslt_file ";
849
850    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
851    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
852    $cmd .= "-m $mapping_file_path";
853    }
854
855    open(*XMLWRITER, $cmd)
856    or die "can't open pipe to xslt: $!";
857
858   
859    $self->{'xslt_writer'} = *XMLWRITER;
860
861    print XMLWRITER "<?DocStart?>\n";       
862    print XMLWRITER "$output_file_name\n";
863
864 
865  }
866 
867
868sub close_xslt_pipe
869{
870  my $self = shift @_;
871
872 
873  return unless defined $self->{'xslt_writer'} ;
874   
875  my $xsltwriter = $self->{'xslt_writer'};
876 
877  print $xsltwriter "<?DocEnd?>\n";
878  close($xsltwriter);
879
880  undef $self->{'xslt_writer'};
881
882}
883
884sub close_file_output
885{
886    my ($self) = @_;
887 
888    # make sure that the handle has been opened - it won't be if we failed
889    # to import any documents...
890    if (defined(fileno(GROUPPROCESS))) {
891    $self->output_xml_footer('GROUPPROCESS','Archive');   
892    close GROUPPROCESS;
893    }
894
895    my $OID = $self->{'gs_OID'};
896    my $short_doc_file = $self->{'short_doc_file'};
897   
898    if ($self->{'gzip'}) {
899    my $doc_file = $self->{'gs_filename'};
900    `gzip $doc_file`;
901    $doc_file .= ".gz";
902    $short_doc_file .= ".gz";
903    if (!-e $doc_file) {
904         my $outhandle = $self->{'output_handle'};
905        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
906        return 0;
907    }
908    }
909
910    # store reference in output_info
911    my $output_info = $self->{'output_info'};
912    return 0 if (!defined $output_info);
913    $output_info->add_info($OID, $short_doc_file, undef, undef);
914    return 1;
915}
916
917
918#the subclass should implement this method if is_group method could return 1.
919sub close_group_output{
920   my $self = shift (@_);       
921}
922
923sub is_group {
924    my $self = shift (@_);
925    return 0;       
926}
927
928my $dc_set = { Title => 1,       
929           Creator => 1,
930           Subject => 1,
931           Description => 1,
932           Publisher => 1,
933           Contributor => 1,
934           Date => 1,
935           Type => 1,
936           Format => 1,
937           Identifier => 1,
938           Source => 1,
939           Language => 1,
940           Relation => 1,
941           Coverage => 1,
942           Rights => 1};
943
944
945# returns an XML representation of the dublin core metadata
946# if dc meta is not found, try ex mete
947sub get_dc_metadata {
948    my $self = shift(@_);
949    my ($doc_obj, $section, $version) = @_;
950   
951    # build up string of dublin core metadata
952    $section="" unless defined $section;
953   
954    my $section_ptr = $doc_obj->_lookup_section($section);
955    return "" unless defined $section_ptr;
956
957
958    my $explicit_dc = {};
959    my $explicit_ex = {};
960
961    my $all_text="";
962    foreach my $data (@{$section_ptr->{'metadata'}}){
963    my $escaped_value = &docprint::escape_text($data->[1]);
964    if ($data->[0]=~ m/^dc\./) {
965        $data->[0] =~ tr/[A-Z]/[a-z]/;
966
967        $data->[0] =~ m/^dc\.(.*)/;
968        my $dc_element =  $1;
969
970        if (!defined $explicit_dc->{$dc_element}) {
971        $explicit_dc->{$dc_element} = [];
972        }
973        push(@{$explicit_dc->{$dc_element}},$escaped_value);
974
975        if (defined $version && ($version eq "oai_dc")) {
976        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
977        }
978        else {
979        # qualifier???
980        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
981        }
982
983    }
984    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
985        $data->[0] =~ m/^(ex\.)?(.*)/;
986        my $ex_element =  $2;
987        my $lc_ex_element = lc($ex_element);
988
989        if (defined $dc_set->{$ex_element}) {
990        if (!defined $explicit_ex->{$lc_ex_element}) {
991            $explicit_ex->{$lc_ex_element} = [];
992        }
993        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
994        }
995    }
996    }
997
998    # go through dc_set and for any element *not* defined in explicit_dc
999    # that does exist in explicit_ex, add it in as metadata
1000    foreach my $k ( keys %$dc_set ) {
1001    my $lc_k = lc($k);
1002
1003    if (!defined $explicit_dc->{$lc_k}) {
1004        if (defined $explicit_ex->{$lc_k}) {
1005
1006        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1007            my $dc_element    = $lc_k;
1008            my $escaped_value = $v;
1009
1010            if (defined $version && ($version eq "oai_dc")) {
1011            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1012            }
1013            else {
1014            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1015            }
1016           
1017        }
1018        }
1019    }
1020    }
1021
1022    if ($all_text eq "") {
1023    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1024    }   
1025    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1026
1027    return $all_text;
1028}
1029
1030# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1031# This method was apparently added by Jeffrey and committed by Shaoqun.
1032# But we don't know why it was added, so not using it anymore.
1033sub new_get_dc_metadata {
1034   
1035    my $self = shift(@_);
1036    my ($doc_obj, $section, $version) = @_;
1037
1038    # build up string of dublin core metadata
1039    $section="" unless defined $section;
1040   
1041    my $section_ptr=$doc_obj->_lookup_section($section);
1042    return "" unless defined $section_ptr;
1043
1044    my $all_text = "";
1045    foreach my $data (@{$section_ptr->{'metadata'}}){
1046    my $escaped_value = &docprint::escape_text($data->[1]);
1047    my $dc_element =  $data->[0];
1048   
1049    my @array = split('\.',$dc_element);
1050    my ($type,$name);
1051
1052    if(defined $array[1])
1053    {
1054        $type = $array[0];
1055        $name = $array[1];
1056    }
1057    else
1058    {
1059        $type = "ex";
1060        $name = $array[0];
1061    }
1062   
1063    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1064    }
1065    return $all_text;
1066}
1067
1068
10691;
Note: See TracBrowser for help on using the browser.