root/gsdl/trunk/perllib/plugouts/BasePlugout.pm @ 19180

Revision 19180, 26.3 KB (checked in by kjdon, 11 years ago)

remove any slashes from OID when generating an archive dir, otherwise may get different directory strucure between windows and linux and not sure that it would work properly like that

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use gsprintf 'gsprintf';
35use printusage;
36use parse2;
37use GDBMUtils;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45       { 'name' => "group_size",
46    'desc' => "{BasPlugout.group_size}",
47    'type' => "int",
48        'deft' =>  "1",
49    'reqd' => "no",
50    'hiddengli' => "no"},
51       { 'name' => "output_info",
52    'desc' => "{BasPlugout.output_info}",
53    'type' => "string",   
54    'reqd' => "yes",
55    'hiddengli' => "yes"},       
56       { 'name' => "xslt_file",
57    'desc' => "{BasPlugout.xslt_file}",
58    'type' => "string",
59    'reqd' => "no",
60    'hiddengli' => "no"},
61       { 'name' => "output_handle",
62    'desc' => "{BasPlugout.output_handle}",
63    'type' => "string",
64        'deft' =>  'STDERR',
65    'reqd' => "no",
66    'hiddengli' => "yes"},
67       { 'name' => "verbosity",
68    'desc' => "{BasPlugout.verbosity}",
69    'type' => "int",
70        'deft' =>  "0",
71    'reqd' => "no", 
72        'hiddengli' => "no"},
73       { 'name' => "gzip_output",
74    'desc' => "{BasPlugout.gzip_output}",
75    'type' => "flag",
76    'reqd' => "no", 
77        'hiddengli' => "no"},
78       { 'name' => "debug",
79     'desc' => "{BasPlugout.debug}",
80     'type' => "flag",
81     'reqd' => "no",
82     'hiddengli' => "yes"}
83];
84
85my $options = { 'name'     => "BasePlugout",
86        'desc'     => "{BasPlugout.desc}",
87        'abstract' => "yes",
88        'inherits' => "no",
89        'args'     => $arguments};
90
91sub new
92{
93    my $class = shift (@_);
94
95    my ($plugoutlist,$args,$hashArgOptLists) = @_;
96    push(@$plugoutlist, $class);
97
98    my $strPlugoutName = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
99
100    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
101    push(@{$hashArgOptLists->{"OptList"}},$options);
102
103    my $self = {};
104    $self->{'plugout_type'} = $class;
105    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
106    $self->{"info_only"} = 0;
107
108    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
109    # the args, just return the object. 
110    foreach my $strArg (@{$args})
111    {
112    if(defined $strArg && $strArg eq "-gsdlinfo")
113    {
114        $self->{"info_only"} = 1;
115        return bless $self, $class;
116    }
117    }
118   
119    delete $self->{"info_only"};
120   
121    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
122    {
123    my $classTempClass = bless $self, $class;
124    print STDERR "<BadPlugout d=$self->{'plugout_name'}>\n";
125    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $self->{'plugout_name'});
126    $classTempClass->print_txt_usage("");  # Use default resource bundle
127    die "\n";
128    }
129
130 
131    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
132    {
133      ##$self->{'xslt_file'} =~ s/\"//g;##working on Windows???
134    print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n" and  die "\n" unless (-e $self->{'xslt_file'});
135    }
136
137    $self->{'gs_count'} = 0;   
138
139    $self->{'keep_import_structure'} = 0;
140
141    return bless $self, $class;
142
143}
144
145sub print_xml_usage
146{
147    my $self = shift(@_);
148    my $header = shift(@_);
149    my $high_level_information_only = shift(@_);
150   
151    # XML output is always in UTF-8
152    gsprintf::output_strings_in_UTF8;
153
154    if ($header) {
155    &PrintUsage::print_xml_header("plugout");
156    }
157    $self->print_xml($high_level_information_only);
158}
159
160
161sub print_xml
162{
163    my $self = shift(@_);
164    my $high_level_information_only = shift(@_);
165
166    my $optionlistref = $self->{'option_list'};
167    my @optionlist = @$optionlistref;
168    my $plugoutoptions = shift(@$optionlistref);
169    return if (!defined($plugoutoptions));
170
171    gsprintf(STDERR, "<PlugoutInfo>\n");
172    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
173    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
174    $desc =~ s/</&amp;lt;/g; # doubly escaped
175    $desc =~ s/>/&amp;gt;/g;
176    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
177    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
178    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
179    unless (defined($high_level_information_only)) {
180    gsprintf(STDERR, "  <Arguments>\n");
181    if (defined($plugoutoptions->{'args'})) {
182        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
183    }
184    gsprintf(STDERR, "  </Arguments>\n");
185
186    # Recurse up the plugout hierarchy
187    $self->print_xml();
188    }
189    gsprintf(STDERR, "</PlugoutInfo>\n");
190}
191
192
193sub print_txt_usage
194{
195    my $self = shift(@_);
196
197    # Print the usage message for a plugout (recursively)
198    my $descoffset = $self->determine_description_offset(0);
199    $self->print_plugout_usage($descoffset, 1);
200}
201
202sub determine_description_offset
203{
204    my $self = shift(@_);
205    my $maxoffset = shift(@_);
206
207    my $optionlistref = $self->{'option_list'};
208    my @optionlist = @$optionlistref;
209    my $plugoutoptions = pop(@$optionlistref);
210    return $maxoffset if (!defined($plugoutoptions));
211
212    # Find the length of the longest option string of this download
213    my $plugoutargs = $plugoutoptions->{'args'};
214    if (defined($plugoutargs)) {
215    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
216    if ($longest > $maxoffset) {
217        $maxoffset = $longest;
218    }
219    }
220
221    # Recurse up the download hierarchy
222    $maxoffset = $self->determine_description_offset($maxoffset);
223    $self->{'option_list'} = \@optionlist;
224    return $maxoffset;
225}
226
227
228sub print_plugout_usage
229{
230    my $self = shift(@_);
231    my $descoffset = shift(@_);
232    my $isleafclass = shift(@_);
233
234    my $optionlistref = $self->{'option_list'};
235    my @optionlist = @$optionlistref;
236    my $plugoutoptions = shift(@$optionlistref);
237    return if (!defined($plugoutoptions));
238
239    my $plugoutname = $plugoutoptions->{'name'};
240    my $plugoutargs = $plugoutoptions->{'args'};
241    my $plugoutdesc = $plugoutoptions->{'desc'};
242
243    # Produce the usage information using the data structure above
244    if ($isleafclass) {
245    if (defined($plugoutdesc)) {
246        gsprintf(STDERR, "$plugoutdesc\n\n");
247    }
248    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
249    }
250
251    # Display the download options, if there are some
252    if (defined($plugoutargs)) {
253    # Calculate the column offset of the option descriptions
254    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
255
256    if ($isleafclass) {
257        gsprintf(STDERR, " {common.specific_options}:\n");
258    }
259    else {
260        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
261    }
262
263    # Display the download options
264    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
265    }
266
267    # Recurse up the download hierarchy
268    $self->print_plugout_usage($descoffset, 0);
269    $self->{'option_list'} = \@optionlist;
270}
271
272
273sub error
274{
275      my ($strFunctionName,$strError) = @_;
276    {
277    print "Error occoured in BasePlugout.pm\n".
278        "In Function: ".$strFunctionName."\n".
279        "Error Message: ".$strError."\n";
280    exit(-1);
281    } 
282}
283
284# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
285sub set_OIDtype {
286    my $self = shift (@_);
287    my ($type, $metadata) = @_;
288
289    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
290    $self->{'OIDtype'} = $type;
291    } else {
292    $self->{'OIDtype'} = "hash";
293    }
294    if ($type =~ /^assigned$/) {
295    if (defined $metadata) {
296        $self->{'OIDmetadata'} = $metadata;
297    } else {
298        $self->{'OIDmetadata'} = "dc.Identifier";
299    }
300    }
301}
302
303sub set_output_dir
304{
305    my $self = shift @_;
306    my ($output_dir) = @_;
307
308   $self->{'output_dir'} = $output_dir;
309}
310
311sub setoutputdir
312{
313    my $self = shift @_;
314    my ($output_dir) = @_;
315
316    $self->{'output_dir'} = $output_dir;
317}
318
319sub get_output_dir
320{
321    my $self = shift (@_);
322
323   return $self->{'output_dir'};
324}
325
326sub getoutputdir
327{
328    my $self = shift (@_);
329
330    return $self->{'output_dir'};
331}
332
333sub getoutputinfo
334{
335    my $self = shift (@_);
336
337    return $self->{'output_info'};
338}
339
340
341sub get_output_handler
342{
343    my $self = shift (@_);
344
345    my ($output_file_name) = @_;
346     
347    open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for  $output_file_name\n";
348
349    return *OUTPUT;           
350}
351
352sub release_output_handler
353{
354    my $self = shift (@_);
355    my ($outhandler) = @_;
356
357    close($outhandler);
358
359}
360
361sub output_xml_header {
362    my $self = shift (@_);
363    my ($handle,$docroot,$nondoctype) = @_;
364   
365    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
366   
367    if (!defined $nondoctype){
368    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
369    }
370
371    print $handle "<$docroot>\n" if defined $docroot;
372}
373
374sub output_xml_footer {
375    my $self = shift (@_);
376    my ($handle,$docroot) = @_;
377    print $handle "</$docroot>\n" if defined $docroot;
378}
379
380sub process {
381    my $self = shift (@_);
382    my ($doc_obj) = @_;
383   
384    $doc_obj->set_lastmodified();
385
386     if ($self->{'group_size'} > 1) {
387     $self->group_process ($doc_obj);
388    return;
389    }
390
391    my $OID = $doc_obj->get_OID();
392    $OID = "NULL" unless defined $OID;     
393
394    my $top_section = $doc_obj->get_top_section();
395
396    #get document's directory
397    my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
398   
399    my $output_info = $self->{'output_info'};
400    return if (!defined $output_info);
401     
402    ##############################
403    # call subclass' saveas method
404    ##############################
405    $self->saveas($doc_obj,$doc_dir);
406    $self->archiveinf_gdbm($doc_obj,$doc_dir);
407
408}
409
410sub store_output_info_reference {
411    my $self = shift (@_);
412    my ($doc_obj) = @_;
413
414    my $output_info = $self->{'output_info'};
415    my $metaname = $self->{'sortmeta'};
416    if (!defined $metaname || $metaname !~ /\S/) {
417    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
418    return;
419    }
420   
421    my $metadata = "";
422    my $top_section = $doc_obj->get_top_section();
423   
424    my @commameta_list = split(/,/, $metaname);
425    foreach my $cmn (@commameta_list) {
426    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
427    if ($meta) {
428        # do remove prefix/suffix - this will apply to all values
429        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
430        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
431        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
432        $metadata .= $meta if ($meta);
433    }
434    }
435
436    # store reference in the output_info     
437    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
438   
439}
440
441sub group_process {
442
443    my $self = shift (@_);
444    my ($doc_obj) = @_;
445   
446    my $OID = $doc_obj->get_OID();
447    $OID = "NULL" unless defined $OID;
448
449    my $groupsize = $self->{'group_size'};
450    my $gs_count = $self->{'gs_count'};
451    my $open_new_file = (($gs_count % $groupsize)==0);
452    my $outhandle = $self->{'output_handle'};
453
454    # opening a new file, or document has assoicated files => directory needed
455    if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
456         
457        # The directory the archive file (doc.xml) and all associated files
458        # should end up in
459        my $doc_dir;
460        # If we've determined its time for a new file, open it now
461        if ($open_new_file || !defined($self->{'gs_doc_dir'}))
462          {
463            $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
464            # only if opening new file
465        my $output_dir = $self->get_output_dir();
466        &util::mk_all_dir ($output_dir) unless -e $output_dir;
467        my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
468        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
469       
470        if ($gs_count>0)
471        {
472        return if (!$self->close_file_output());
473        }
474
475        open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
476           
477
478        $self->{'gs_filename'} = $doc_file;
479        $self->{'short_doc_file'} = $short_doc_file;
480        $self->{'gs_OID'} = $OID;
481            $self->{'gs_doc_dir'} = $doc_dir;
482
483        $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
484    }
485        # Otherwise load the same archive document directory used last time
486        else
487          {
488            $doc_dir = $self->{'gs_doc_dir'};
489          }
490
491    # copy all the associated files, add this information as metadata
492    # to the document
493        print $outhandle "Writing associated files to $doc_dir\n";
494    $self->process_assoc_files ($doc_obj, $doc_dir);
495    }
496
497    # save this document
498    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
499    print GROUPPROCESS $section_text;
500
501    $self->{'gs_count'}++;
502}
503
504
505sub saveas {
506    my $self = shift (@_);
507   
508    die "Basplug::saveas function must be implemented in sub classes\n";
509}
510
511sub get_doc_dir {
512    my $self = shift (@_);
513    my ($OID, $source_filename) = @_;
514
515    my $working_dir  = $self->get_output_dir();
516    my $working_info = $self->{output_info};
517    return if (!defined $working_info);
518
519    my $doc_info = $working_info->get_info($OID);
520    my $doc_dir = '';
521
522    if (defined $doc_info && scalar(@$doc_info) >= 1)
523    {
524    # This OID already has an archives directory, so use it again
525    $doc_dir = $doc_info->[0];
526    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
527    }
528    elsif ($self->{'keep_import_structure'})
529    {
530    $source_filename = &File::Basename::dirname($source_filename);
531    $source_filename =~ s/[\\\/]+/\//g;
532    $source_filename =~ s/\/$//;
533
534        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
535    }
536
537    # We have to use a new archives directory for this document
538    if ($doc_dir eq "")
539    {
540    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
541    }
542
543    if (!defined $self->{'group'} || !$self->{'group'}){
544    &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
545    }
546
547    return $doc_dir;
548}
549
550sub get_new_doc_dir{
551   my $self = shift (@_); 
552   my($working_info,$working_dir,$OID) = @_;     
553   
554   
555   my $doc_dir = "";
556   my $doc_dir_rest = $OID;
557   # remove any \ and / from the OID
558   $doc_dir_rest =~ s/[\\\/]//g;
559   my $doc_dir_num = 0;
560
561   do {
562       $doc_dir .= "/" if $doc_dir_num > 0;
563       if ($doc_dir_rest =~ s/^(.{1,8})//) {
564       $doc_dir .= $1;
565       $doc_dir_num++;
566       }
567   } while ($doc_dir_rest ne "" &&
568        ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
569         ($working_info->size() >= 1024 && $doc_dir_num < 2)));
570   my $i = 1;
571   my $doc_dir_base = $doc_dir;
572   while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
573       $doc_dir = "$doc_dir_base-$i";
574       $i++;
575   }
576         
577   return "$doc_dir.dir";
578}
579
580sub process_assoc_files {
581    my $self = shift (@_);
582    my ($doc_obj, $doc_dir, $handle) = @_;
583
584    my $outhandle = $self->{'output_handle'};
585   
586    my $output_dir = $self->get_output_dir();
587    return if (!defined $output_dir);
588
589    &util::mk_all_dir ($output_dir) unless -e $output_dir;
590     
591    my $working_dir = &util::filename_cat($output_dir, $doc_dir);
592    &util::mk_all_dir ($working_dir) unless -e $working_dir;
593
594    my @assoc_files = ();
595    my $filename;;
596
597    my $source_filename = $doc_obj->get_source_filename();
598
599    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
600
601    if (defined $collect_dir) {
602    my $dirsep_regexp = &util::get_os_dirsep();
603
604    if ($collect_dir !~ /$dirsep_regexp$/) {
605        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
606    }
607
608    # This test is never going to fail on Windows -- is this a problem?
609     
610    if ($source_filename !~ /^$dirsep_regexp/) {
611        $source_filename = &util::filename_cat($collect_dir, $source_filename);
612    }
613    }
614
615
616    # set the assocfile path (even if we have no assoc files - need this for lucene)
617    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
618                     "assocfilepath",
619                     "$doc_dir");
620    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
621    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
622    $dir = "" unless defined $dir;
623       
624   
625    my $real_filename = $assoc_file_rec->[0];
626    # for some reasons the image associate file has / before the full path
627    $real_filename =~ s/^\\(.*)/$1/i;
628    if (-e $real_filename) {
629
630        $filename = &util::filename_cat($working_dir, $afile);
631
632        &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
633       
634        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
635                     "gsdlassocfile",
636                     "$afile:$assoc_file_rec->[2]:$dir");
637    } elsif ($self->{'verbosity'} > 2) {
638        print $outhandle "BasePlugout::process couldn't copy the associated file " .
639        "$real_filename to $afile\n";
640    }
641    }
642}
643
644
645sub archiveinf_gdbm
646{
647    my $self = shift (@_);
648    my ($doc_obj) = @_;
649
650    my $verbosity = $self->{'verbosity'};
651
652    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
653    if (defined $collect_dir) {
654    my $dirsep_regexp = &util::get_os_dirsep();
655
656    if ($collect_dir !~ /$dirsep_regexp$/) {
657        # ensure there is a slash at the end
658        $collect_dir .= &util::get_dirsep();
659    }
660    }
661
662    my $oid = $doc_obj->get_OID();
663    my $source_filename = $doc_obj->get_source_filename();
664
665    my $working_info = $self->{'output_info'};
666    my $doc_info = $working_info->get_info($oid);
667    my ($doc_file,$index_status) = @$doc_info;
668
669    my $oid_files = { 'doc-file' => $doc_file,
670              'index-status' => $index_status,
671              'src-file' => $source_filename,
672              'assoc-files' => [] };
673   
674    my %reverse_lookups = ( $source_filename => "1" );
675
676    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
677    my $real_filename = $assoc_file_rec->[0];
678    my $full_afile = $assoc_file_rec->[1];
679
680    # for some reasons the image associate file has / before the full path
681    $real_filename =~ s/^\\(.*)/$1/i;
682    if (-e $real_filename) {
683
684        if (defined $collect_dir) {
685        my $collect_dir_re_safe = $collect_dir;
686        $collect_dir_re_safe =~ s/\\/\\\\/g;
687        $collect_dir_re_safe =~ s/\./\\./g;
688
689        $real_filename =~ s/^$collect_dir_re_safe//;
690        }
691
692        $reverse_lookups{$real_filename} = 1;
693
694        push(@{$oid_files->{'assoc-files'}},$full_afile);
695
696    }
697    else {
698        print STDERR "Warning: archiveinf_gdbm()\n  $real_filename does not appear to be on the file system\n";
699    }
700    }
701
702    # better not to commit to a particular db implementation, but
703    # for simplicity, will use GDBM for now.
704
705    my $output_dir = $self->{'output_dir'};
706
707    my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc.gdb");
708    my $src_db = &util::filename_cat($output_dir,"archiveinf-src.gdb");
709
710    my $doc_db_text = "";
711    $doc_db_text .= "<doc-file>$oid_files->{'doc-file'}\n";
712    $doc_db_text .= "<index-status>$oid_files->{'index-status'}\n";
713    $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n";
714    foreach my $af (@{$oid_files->{'assoc-files'}}) {
715    $doc_db_text .= "<assoc-file>$af\n";
716    }
717    chomp($doc_db_text); # remove trailing \n
718
719    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
720    &GDBMUtils::gdbmDatabaseSet($doc_db,$oid,$doc_db_text);
721
722    foreach my $rl (keys %reverse_lookups) {
723    &GDBMUtils::gdbmDatabaseAppend($src_db,$rl,"<oid>$oid\n");
724    }
725   
726}
727
728
729sub set_sortmeta {
730    my $self = shift (@_);
731    my ($sortmeta, $removeprefix, $removesuffix) = @_;
732   
733    $self->{'sortmeta'} = $sortmeta;
734    if (defined ($removeprefix) && $removeprefix ) {
735    $removeprefix =~ s/^\^//; # don't need a leading ^
736    $self->{'removeprefix'} = $removeprefix;
737    }
738    if (defined ($removesuffix) && $removesuffix) {
739    $removesuffix =~ s/\$$//; # don't need a trailing $
740    $self->{'removesuffix'} = $removesuffix;
741    }
742}
743
744sub open_xslt_pipe
745{
746    my $self = shift @_;
747    my ($output_file_name, $xslt_file)=@_;
748
749   return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
750
751    my $java_class_path =  &util::filename_cat ($ENV{'GSDLHOME'},"bin","java");
752
753    my $mapping_file_path = "";
754
755    if ($ENV{'GSDLOS'} eq "windows"){
756    $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
757    $xslt_file = "\"file:///".$xslt_file."\"";
758    $mapping_file_path = "\"file:///";
759    }
760    else{
761    $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
762    }
763
764
765    $java_class_path = "\"".$java_class_path."\"";
766
767    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t $xslt_file ";
768
769    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
770    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
771    $cmd .= "-m $mapping_file_path";
772    }
773
774    open(*XMLWRITER, $cmd)
775    or die "can't open pipe to xslt: $!";
776
777   
778    $self->{'xslt_writer'} = *XMLWRITER;
779
780    print XMLWRITER "<?DocStart?>\n";       
781    print XMLWRITER "$output_file_name\n";
782
783 
784  }
785 
786
787sub close_xslt_pipe
788{
789  my $self = shift @_;
790
791 
792  return unless defined $self->{'xslt_writer'} ;
793   
794  my $xsltwriter = $self->{'xslt_writer'};
795 
796  print $xsltwriter "<?DocEnd?>\n";
797  close($xsltwriter);
798
799  undef $self->{'xslt_writer'};
800
801}
802
803sub close_file_output
804{
805    my ($self) = @_;
806 
807    # make sure that the handle has been opened - it won't be if we failed
808    # to import any documents...
809    if (defined(fileno(GROUPPROCESS))) {
810    $self->output_xml_footer('GROUPPROCESS','Archive');   
811    close GROUPPROCESS;
812    }
813
814    my $OID = $self->{'gs_OID'};
815    my $short_doc_file = $self->{'short_doc_file'};
816   
817    if ($self->{'gzip'}) {
818    my $doc_file = $self->{'gs_filename'};
819    `gzip $doc_file`;
820    $doc_file .= ".gz";
821    $short_doc_file .= ".gz";
822    if (!-e $doc_file) {
823         my $outhandle = $self->{'output_handle'};
824        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
825        return 0;
826    }
827    }
828
829    # store reference in output_info
830    my $output_info = $self->{'output_info'};
831    return 0 if (!defined $output_info);
832    $output_info->add_info($OID, $short_doc_file, undef, undef);
833    return 1;
834}
835
836
837#the subclass should implement this method if is_group method could return 1.
838sub close_group_output{
839   my $self = shift (@_);       
840}
841
842sub is_group {
843    my $self = shift (@_);
844    return 0;       
845}
846
847my $dc_set = { Title => 1,       
848           Creator => 1,
849           Subject => 1,
850           Description => 1,
851           Publisher => 1,
852           Contributor => 1,
853           Date => 1,
854           Type => 1,
855           Format => 1,
856           Identifier => 1,
857           Source => 1,
858           Language => 1,
859           Relation => 1,
860           Coverage => 1,
861           Rights => 1};
862
863
864# returns an XML representation of the dublin core metadata
865# if dc meta is not found, try ex mete
866sub get_dc_metadata {
867    my $self = shift(@_);
868    my ($doc_obj, $section, $version) = @_;
869   
870    # build up string of dublin core metadata
871    $section="" unless defined $section;
872   
873    my $section_ptr = $doc_obj->_lookup_section($section);
874    return "" unless defined $section_ptr;
875
876
877    my $explicit_dc = {};
878    my $explicit_ex = {};
879
880    my $all_text="";
881    foreach my $data (@{$section_ptr->{'metadata'}}){
882    my $escaped_value = &docprint::escape_text($data->[1]);
883    if ($data->[0]=~ m/^dc\./) {
884        $data->[0] =~ tr/[A-Z]/[a-z]/;
885
886        $data->[0] =~ m/^dc\.(.*)/;
887        my $dc_element =  $1;
888
889        if (!defined $explicit_dc->{$dc_element}) {
890        $explicit_dc->{$dc_element} = [];
891        }
892        push(@{$explicit_dc->{$dc_element}},$escaped_value);
893
894        if (defined $version && ($version eq "oai_dc")) {
895        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
896        }
897        else {
898        # qualifier???
899        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
900        }
901
902    }
903    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
904        $data->[0] =~ m/^(ex\.)?(.*)/;
905        my $ex_element =  $2;
906        my $lc_ex_element = lc($ex_element);
907
908        if (defined $dc_set->{$ex_element}) {
909        if (!defined $explicit_ex->{$lc_ex_element}) {
910            $explicit_ex->{$lc_ex_element} = [];
911        }
912        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
913        }
914    }
915    }
916
917    # go through dc_set and for any element *not* defined in explicit_dc
918    # that does exist in explicit_ex, add it in as metadata
919    foreach my $k ( keys %$dc_set ) {
920    my $lc_k = lc($k);
921
922    if (!defined $explicit_dc->{$lc_k}) {
923        if (defined $explicit_ex->{$lc_k}) {
924
925        foreach my $v (@{$explicit_ex->{$lc_k}}) {
926            my $dc_element    = $lc_k;
927            my $escaped_value = $v;
928
929            if (defined $version && ($version eq "oai_dc")) {
930            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
931            }
932            else {
933            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
934            }
935           
936        }
937        }
938    }
939    }
940
941    if ($all_text eq "") {
942    $all_text .= "   There is no Dublin Core metatdata in this document\n";
943    }   
944    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
945
946    return $all_text;
947}
948
949# Build up dublin_core metadata.  Priority given to dc.* over ex.*
950# This method was apparently added by Jeffrey and committed by Shaoqun.
951# But we don't know why it was added, so not using it anymore.
952sub new_get_dc_metadata {
953   
954    my $self = shift(@_);
955    my ($doc_obj, $section, $version) = @_;
956
957    # build up string of dublin core metadata
958    $section="" unless defined $section;
959   
960    my $section_ptr=$doc_obj->_lookup_section($section);
961    return "" unless defined $section_ptr;
962
963    my $all_text = "";
964    foreach my $data (@{$section_ptr->{'metadata'}}){
965    my $escaped_value = &docprint::escape_text($data->[1]);
966    my $dc_element =  $data->[0];
967   
968    my @array = split('\.',$dc_element);
969    my ($type,$name);
970
971    if(defined $array[1])
972    {
973        $type = $array[0];
974        $name = $array[1];
975    }
976    else
977    {
978        $type = "ex";
979        $name = $array[0];
980    }
981   
982    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
983    }
984    return $all_text;
985}
986
987
9881;
Note: See TracBrowser for help on using the browser.