root/main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm @ 32159

Revision 32159, 37.7 KB (checked in by ak19, 3 years ago)

incremental building was not being incremental when no metadata was assigned to any of the files (as happens with our docs for quick test collections). A default metadata.xml is present, but 'empty' in that it contains no FileSet? elements with metadata elements assigned to FileName? elements. But we still want incremental behaviour. The idea was to write out an entry into archiveinf-src.db for each metadata.xml processed, not just for each meta.xml file actually referencing a doc, as BasePlugout? was doing so far on a per doc basis. Kathy come up with the actual infrastructure that can make it work (to ensure all the necessary objects are available), Dr Bainbridge approved this, and it's now been added into the code.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40use sorttools;
41
42# suppress the annoying "subroutine redefined" warning that various
43# gets cause under perl 5.6
44$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
45
46my $arguments = [
47       { 'name' => "xslt_file",
48    'desc' => "{BasPlugout.xslt_file}",
49    'type' => "string",
50    'reqd' => "no",
51     'deft' => "",
52    'hiddengli' => "no"},
53       { 'name' => "subdir_split_length",
54     'desc' => "{BasPlugout.subdir_split_length}",
55     'type' => "int",
56     'reqd' => "no",
57         'deft' => "8",
58     'hiddengli' => "no"},
59       { 'name' => "subdir_hash_prefix",
60     'desc' => "{BasPlugout.subdir_hash_prefix}",
61     'type' => "flag",
62     'reqd' => "no",
63         'deft' => "0",
64     'hiddengli' => "no"},
65       { 'name' => "gzip_output",
66    'desc' => "{BasPlugout.gzip_output}",
67    'type' => "flag",
68    'reqd' => "no", 
69        'hiddengli' => "no"},
70        { 'name' => "verbosity",
71    'desc' => "{BasPlugout.verbosity}",
72    'type' => "int",
73        'deft' =>  "0",
74    'reqd' => "no", 
75        'hiddengli' => "no"},
76      { 'name' => "output_info",
77    'desc' => "{BasPlugout.output_info}",
78    'type' => "string",   
79    'reqd' => "yes",
80    'hiddengli' => "yes"},       
81       { 'name' => "output_handle",
82    'desc' => "{BasPlugout.output_handle}",
83    'type' => "string",
84        'deft' =>  'STDERR',
85    'reqd' => "no",
86    'hiddengli' => "yes"},
87       { 'name' => "debug",
88     'desc' => "{BasPlugout.debug}",
89     'type' => "flag",
90     'reqd' => "no",
91     'hiddengli' => "yes"},
92       { 'name' => 'no_rss',
93         'desc' => "{BasPlugout.no_rss}",
94         'type' => 'flag',
95         'reqd' => 'no',
96         'hiddengli' => 'yes'},
97       { 'name' => 'rss_title',
98         'desc' => "{BasPlugout.rss_title}",
99         'type' => 'string',
100     'deft' => 'dc.Title',
101         'reqd' => 'no',
102         'hiddengli' => 'yes'},
103    { 'name' => "no_auxiliary_databases",
104      'desc' => "{BasPlugout.no_auxiliary_databases}",
105      'type' => "flag",
106      'reqd' => "no",
107      'hiddengli' => "yes"}
108
109];
110
111my $options = { 'name'     => "BasePlugout",
112        'desc'     => "{BasPlugout.desc}",
113        'abstract' => "yes",
114        'inherits' => "no",
115        'args'     => $arguments};
116
117sub new
118{
119    my $class = shift (@_);
120
121    my ($plugoutlist,$args,$hashArgOptLists) = @_;
122    push(@$plugoutlist, $class);
123
124    my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
125
126    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
127    push(@{$hashArgOptLists->{"OptList"}},$options);
128
129    my $self = {};
130    $self->{'plugout_type'} = $class;
131    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
132    $self->{"info_only"} = 0;
133
134    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
135    # the args, just return the object. 
136    foreach my $strArg (@{$args})
137    {
138    if(defined $strArg && $strArg eq "-gsdlinfo")
139    {
140        $self->{"info_only"} = 1;
141        return bless $self, $class;
142    }
143    }
144   
145    delete $self->{"info_only"};
146   
147    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
148    {
149    my $classTempClass = bless $self, $class;
150    print STDERR "<BadPlugout d=$plugout_name>\n";
151    &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
152    $classTempClass->print_txt_usage("");  # Use default resource bundle
153    die "\n";
154    }
155
156 
157    if(defined $self->{'xslt_file'} &&  $self->{'xslt_file'} ne "")
158    {
159    my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
160    if (!defined $full_file_path) {
161        print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path or put the file into the collection's etc or greenstone's etc folder\n";
162        die "\n";
163    }
164    $self->{'xslt_file'} = $full_file_path;
165    }
166
167    # for group processing
168    $self->{'gs_count'} = 0;
169    $self->{'group_position'} = 1;
170
171    $self->{'keep_import_structure'} = 0;
172
173    $self->{'generate_databases'} = 1;
174    if ($self->{'no_auxiliary_databases'}) {
175    $self->{'generate_databases'} = 0;
176    }
177    undef $self->{'no_auxiliary_databases'};
178    return bless $self, $class;
179
180}
181
182# implement this in subclass if you want to do some initialization after
183# loading and setting parameters, and before processing the documents.
184sub begin {
185
186    my $self= shift (@_);
187
188}
189sub print_xml_usage
190{
191    my $self = shift(@_);
192    my $header = shift(@_);
193    my $high_level_information_only = shift(@_);
194
195    # XML output is always in UTF-8
196    gsprintf::output_strings_in_UTF8;
197
198    if ($header) {
199    &PrintUsage::print_xml_header("plugout");
200    }
201    $self->print_xml($high_level_information_only);
202}
203
204
205sub print_xml
206{
207    my $self = shift(@_);
208    my $high_level_information_only = shift(@_);
209
210    my $optionlistref = $self->{'option_list'};
211    my @optionlist = @$optionlistref;
212    my $plugoutoptions = shift(@$optionlistref);
213    return if (!defined($plugoutoptions));
214
215    gsprintf(STDERR, "<PlugoutInfo>\n");
216    gsprintf(STDERR, "  <Name>$plugoutoptions->{'name'}</Name>\n");
217    my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
218    $desc =~ s/</&amp;lt;/g; # doubly escaped
219    $desc =~ s/>/&amp;gt;/g;
220    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
221    gsprintf(STDERR, "  <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
222    gsprintf(STDERR, "  <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
223    unless (defined($high_level_information_only)) {
224    gsprintf(STDERR, "  <Arguments>\n");
225    if (defined($plugoutoptions->{'args'})) {
226        &PrintUsage::print_options_xml($plugoutoptions->{'args'});
227    }
228    gsprintf(STDERR, "  </Arguments>\n");
229
230    # Recurse up the plugout hierarchy
231    $self->print_xml();
232    }
233    gsprintf(STDERR, "</PlugoutInfo>\n");
234}
235
236
237sub print_txt_usage
238{
239    my $self = shift(@_);
240
241    # Print the usage message for a plugout (recursively)
242    my $descoffset = $self->determine_description_offset(0);
243    $self->print_plugout_usage($descoffset, 1);
244}
245
246sub determine_description_offset
247{
248    my $self = shift(@_);
249    my $maxoffset = shift(@_);
250
251    my $optionlistref = $self->{'option_list'};
252    my @optionlist = @$optionlistref;
253    my $plugoutoptions = pop(@$optionlistref);
254    return $maxoffset if (!defined($plugoutoptions));
255
256    # Find the length of the longest option string of this download
257    my $plugoutargs = $plugoutoptions->{'args'};
258    if (defined($plugoutargs)) {
259    my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
260    if ($longest > $maxoffset) {
261        $maxoffset = $longest;
262    }
263    }
264
265    # Recurse up the download hierarchy
266    $maxoffset = $self->determine_description_offset($maxoffset);
267    $self->{'option_list'} = \@optionlist;
268    return $maxoffset;
269}
270
271
272sub print_plugout_usage
273{
274    my $self = shift(@_);
275    my $descoffset = shift(@_);
276    my $isleafclass = shift(@_);
277
278    my $optionlistref = $self->{'option_list'};
279    my @optionlist = @$optionlistref;
280    my $plugoutoptions = shift(@$optionlistref);
281    return if (!defined($plugoutoptions));
282
283    my $plugoutname = $plugoutoptions->{'name'};
284    my $plugoutargs = $plugoutoptions->{'args'};
285    my $plugoutdesc = $plugoutoptions->{'desc'};
286
287    # Produce the usage information using the data structure above
288    if ($isleafclass) {
289    if (defined($plugoutdesc)) {
290        gsprintf(STDERR, "$plugoutdesc\n\n");
291    }
292    gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
293    }
294
295    # Display the download options, if there are some
296    if (defined($plugoutargs)) {
297    # Calculate the column offset of the option descriptions
298    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
299
300    if ($isleafclass) {
301        gsprintf(STDERR, " {common.specific_options}:\n");
302    }
303    else {
304        gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
305    }
306
307    # Display the download options
308    &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
309    }
310
311    # Recurse up the download hierarchy
312    $self->print_plugout_usage($descoffset, 0);
313    $self->{'option_list'} = \@optionlist;
314}
315
316
317sub error
318{
319      my ($strFunctionName,$strError) = @_;
320    {
321    print "Error occoured in BasePlugout.pm\n".
322        "In Function: ".$strFunctionName."\n".
323        "Error Message: ".$strError."\n";
324    exit(-1);
325    } 
326}
327
328# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
329sub set_OIDtype {
330    my $self = shift (@_);
331    my ($type, $metadata) = @_;
332
333    if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
334    $self->{'OIDtype'} = $type;
335    } else {
336    $self->{'OIDtype'} = "hash";
337    }
338    if ($type =~ /^assigned$/) {
339    if (defined $metadata) {
340        $self->{'OIDmetadata'} = $metadata;
341    } else {
342        $self->{'OIDmetadata'} = "dc.Identifier";
343    }
344    }
345}
346
347sub set_output_dir
348{
349    my $self = shift @_;
350    my ($output_dir) = @_;
351
352   $self->{'output_dir'} = $output_dir;
353}
354
355sub setoutputdir
356{
357    my $self = shift @_;
358    my ($output_dir) = @_;
359
360    $self->{'output_dir'} = $output_dir;
361}
362
363sub get_output_dir
364{
365    my $self = shift (@_);
366
367   return $self->{'output_dir'};
368}
369
370sub getoutputdir
371{
372    my $self = shift (@_);
373
374    return $self->{'output_dir'};
375}
376
377sub getoutputinfo
378{
379    my $self = shift (@_);
380
381    return $self->{'output_info'};
382}
383
384
385sub get_output_handler
386{
387    my $self = shift (@_);
388
389    my ($output_file_name) = @_;
390
391    my $fh;
392    &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
393
394    return $fh;
395}
396
397sub release_output_handler
398{
399    my $self = shift (@_);
400    my ($outhandler) = @_;
401
402    close($outhandler);
403
404}
405
406sub output_xml_header {
407    my $self = shift (@_);
408    my ($handle,$docroot,$nondoctype) = @_;
409   
410   
411    #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
412   
413    #For Dspace must be UTF in lower case
414    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
415   
416    if (!defined $nondoctype){
417    my $doctype = (defined $docroot) ? $docroot : "Section";
418
419    # Used to be '<!DOCTYPE Archive SYSTEM ...'
420   
421    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
422    }
423
424    print $handle "<$docroot>\n" if defined $docroot;
425}
426
427sub output_xml_footer {
428    my $self = shift (@_);
429    my ($handle,$docroot) = @_;
430    print $handle "</$docroot>\n" if defined $docroot;
431}
432
433
434sub output_general_xml_header
435{
436    my $self = shift (@_);
437    my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
438   
439    print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
440   
441    if (defined $opt_dtd) {
442        my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
443    print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
444    }
445
446    if (defined $docroot) {
447        my $full_docroot = $docroot;
448        if (defined $opt_attributes) {
449          $full_docroot .= " $opt_attributes";
450        }
451
452        print $handle "<$full_docroot>\n"
453      }
454}
455
456sub output_general_xml_footer
457{
458    output_xml_footer(@_);
459}
460
461# This is called by the plugins after read_into_doc_obj generates the doc_obj.
462sub process {
463    my $self = shift (@_);
464    my ($doc_obj) = @_;
465
466    my $output_info = $self->{'output_info'};
467    return if (!defined $output_info);
468
469    # for OAI purposes
470    $doc_obj->set_lastmodified();
471    $doc_obj->set_oailastmodified();
472
473    # find out which directory to save to
474    my $doc_dir = "";
475    if ($self->is_group()) {
476    $doc_dir = $self->get_group_doc_dir($doc_obj);     
477    } else {
478    $doc_dir = $self->get_doc_dir($doc_obj);
479    }
480     
481    ##############################
482    # call subclass' saveas method
483    ##############################
484    $self->saveas($doc_obj,$doc_dir);
485
486    # write out data to archiveinf-doc.db
487    if ($self->{'generate_databases'}) {
488    $self->archiveinf_db($doc_obj);
489    }
490    if ($self->is_group()) {
491    $self->{'gs_count'}++; # do we want this for all cases?
492    $self->{'group_position'}++;
493    }
494}
495
496sub store_output_info_reference {
497    my $self = shift (@_);
498    my ($doc_obj) = @_;
499
500    my $output_info = $self->{'output_info'};
501    my $metaname = $self->{'sortmeta'};
502
503    my $group_position;
504    if ($self->is_group()) {
505    $group_position = $self->{'group_position'};
506    }
507    if (!defined $metaname || $metaname !~ /\S/) {
508    my $OID = $doc_obj->get_OID();
509    $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
510    return;
511    }
512   
513    if ($metaname eq "OID") { # sort by OID
514    my $OID = $doc_obj->get_OID();
515    $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
516    return;
517    }
518   
519    my $metadata = "";
520    my $top_section = $doc_obj->get_top_section();
521   
522    my @commameta_list = split(/,/, $metaname);
523    foreach my $cmn (@commameta_list) {
524    my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
525    if ($meta) {
526        # do remove prefix/suffix - this will apply to all values
527        $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};         
528        $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
529        $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
530        $metadata .= $meta if ($meta);
531    }
532    }
533
534    # store reference in the output_info     
535    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
536   
537}
538
539
540
541sub saveas {
542    my $self = shift (@_);
543    my ($doc_obj, $doc_dir) = @_;
544   
545    die "Basplug::saveas function must be implemented in sub classes\n";
546}
547
548sub get_group_doc_dir {
549    my $self = shift (@_);
550    my ($doc_obj) = @_;
551
552    my $outhandle = $self->{'output_handle'};
553    my $OID = $doc_obj->get_OID();
554    $OID = "NULL" unless defined $OID;
555
556    my $groupsize = $self->{'group_size'};
557    my $gs_count = $self->{'gs_count'};
558    my $open_new_file = (($gs_count % $groupsize)==0);
559
560    my $doc_dir;
561
562    if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
563    # if we have some assoc files, then we will need to start a new file
564    if ($self->{'verbosity'} > 2) {
565        print $outhandle " Starting a archives folder for $OID as it has associated files\n";
566    }
567    $open_new_file = 1;
568    }
569   
570    # opening a new file
571    if (($open_new_file)  || !defined($self->{'gs_doc_dir'})) {
572    # first we close off the old output
573    if ($gs_count>0)
574    {
575        return if (!$self->close_group_output());
576    }
577
578    # this will create the directory
579    $doc_dir = $self->get_doc_dir ($doc_obj);
580    $self->{'new_doc_dir'} = 1;
581    $self->{'gs_doc_dir'} = $doc_dir;
582    $self->{'group_position'} = 1;
583    }
584    else {
585    $doc_dir = $self->{'gs_doc_dir'};
586    $self->{'new_doc_dir'} = 0;
587    }
588    return $doc_dir;
589
590}
591sub get_doc_dir {
592   
593    my $self = shift (@_);
594    my ($doc_obj) = @_;
595
596    my $OID = $doc_obj->get_OID();
597    $OID = "NULL" unless defined $OID;
598
599    my $working_dir  = $self->get_output_dir();
600    my $working_info = $self->{'output_info'};
601    return if (!defined $working_info);
602
603    my $doc_info = $working_info->get_info($OID);
604    my $doc_dir = '';
605
606    if (defined $doc_info && scalar(@$doc_info) >= 1)
607    {
608    # This OID already has an archives directory, so use it again
609    $doc_dir = $doc_info->[0];
610    $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
611    }
612    elsif ($self->{'keep_import_structure'})
613    {
614    my $source_filename = $doc_obj->get_source_filename();
615    $source_filename = &File::Basename::dirname($source_filename);
616    $source_filename =~ s/[\\\/]+/\//g;
617    $source_filename =~ s/\/$//;
618
619        $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
620    }
621
622    # We have to use a new archives directory for this document
623    if ($doc_dir eq "")
624    {
625    $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
626    }
627
628    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
629
630    return $doc_dir;
631}
632
633
634## @function get_new_doc_dir()
635#
636# Once a doc object is ready to write to disk (and hence has a nice OID),
637# generate a unique subdirectory to write the information to.
638#
639# - create the directory as part of this call, to try and avoid race conditions
640#   found in parallel processing [jmt12]
641#
642# @todo figure out what the rule regarding $work_info->size() is meant to do
643#
644# @todo determine what $self->{'group'} is, and whether it should affect
645#       directory creation
646#
647sub get_new_doc_dir
648{
649  my $self = shift (@_);
650  my($working_info,$working_dir,$OID) = @_;
651
652  my $doc_dir = "";
653  my $doc_dir_rest = $OID;
654
655  # remove any \ and / from the OID
656  $doc_dir_rest =~ s/[\\\/]//g;
657
658  # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
659  if ($ENV{'GSDLOS'} =~ /^windows$/i)
660  {
661    $doc_dir_rest =~ s/\://g;
662  }
663
664  # we generally create a unique directory by adding consequtive fragments of
665  # the document identifier (split by some predefined length - defaulting to
666  # 8) until we find a directory that doesn't yet exist. Note that directories
667  # that contain a document have a suffix ".dir" (whereas those that contain
668  # only subdirectories have no suffix).
669  my $doc_dir_num = 0; # how many directories deep we are
670  my $created_directory = 0; # have we successfully created a new directory
671  do
672  {
673    # (does this work on windows? - jmt12)
674    if ($doc_dir_num > 0)
675    {
676      $doc_dir .= '/';
677    }
678
679    # the default matching pattern grabs the next 'subdir_split_length'
680    # characters of the OID to act as the next subdirectory
681    my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
682
683    # Do we count any "HASH" prefix against the split length limit?
684    if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
685    {
686      $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
687    }
688
689    # Note the use of 's' to both capture the next chuck of OID and to remove
690    # it from OID at the same time
691    if ($doc_dir_rest =~ s/$pattern//i)
692    {
693      $doc_dir .= $1;
694      $doc_dir_num++;
695
696      my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
697      if(!FileUtils::directoryExists($full_doc_dir))
698      {
699        &FileUtils::makeAllDirectories($full_doc_dir);
700        $created_directory = 1;
701      }
702
703      ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
704      ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
705      ###rint STDERR " - rest: $doc_dir_rest\n";
706      ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
707      ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
708    }
709  }
710  while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
711
712  # not unique yet? Add on an incremental suffix until we are unique
713  my $i = 1;
714  my $doc_dir_base = $doc_dir;
715  while ($created_directory == 0)
716  {
717    $doc_dir = $doc_dir_base . '-' . $i;
718    $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
719    $i++;
720  }
721
722  # in theory this should never happen
723  if (!$created_directory)
724  {
725    die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
726  }
727
728  return $doc_dir . '.dir';
729}
730## get_new_doc_dir()
731
732
733sub process_assoc_files {
734    my $self = shift (@_);
735    my ($doc_obj, $doc_dir, $handle) = @_;
736
737    my $outhandle = $self->{'output_handle'};
738   
739    my $output_dir = $self->get_output_dir();
740    return if (!defined $output_dir);
741
742    &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
743     
744    my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
745    &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
746
747    my @assoc_files = ();
748    my $filename;;
749
750    my $source_filename = $doc_obj->get_source_filename();
751
752    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
753
754    if (defined $collect_dir) {
755    my $dirsep_regexp = &util::get_os_dirsep();
756
757    if ($collect_dir !~ /$dirsep_regexp$/) {
758        $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
759    }
760
761    # This test is never going to fail on Windows -- is this a problem?
762     
763    if ($source_filename !~ /^$dirsep_regexp/) {
764        $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
765    }
766    }
767
768
769    # set the assocfile path (even if we have no assoc files - need this for lucene)
770    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
771                     "assocfilepath",
772                     "$doc_dir");
773    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
774    my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
775    $dir = "" unless defined $dir;
776       
777    my $utf8_real_filename = $assoc_file_rec->[0];
778
779    # for some reasons the image associate file has / before the full path
780    $utf8_real_filename =~ s/^\\(.*)/$1/i;
781
782##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
783    my $real_filename = $utf8_real_filename;
784    $real_filename = &util::downgrade_if_dos_filename($real_filename);
785
786    if (&FileUtils::fileExists($real_filename)) {
787
788        $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
789
790            &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
791
792        $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
793                     "gsdlassocfile",
794                     "$afile:$assoc_file_rec->[2]:$dir");
795    } elsif ($self->{'verbosity'} > 1) {
796        print $outhandle "BasePlugout::process couldn't copy the associated file " .
797        "$real_filename to $afile\n";
798    }
799    }
800}
801
802
803sub process_metafiles_metadata
804{
805    my $self = shift (@_);
806    my ($doc_obj) = @_;
807
808    my $top_section = $doc_obj->get_top_section();
809    my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
810
811    foreach my $metafile_pair (@$metafiles) {
812    my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
813
814    $doc_obj->metadata_file($full_metafile,$metafile);
815    }
816
817    $doc_obj->delete_metadata($top_section,"gsdlmetafile");
818}
819
820sub archiveinf_files_to_field
821{
822    my $self = shift(@_);
823    my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
824
825    foreach my $file_rec (@$files) {
826    my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
827    my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
828    # for some reasons the image associate file has / before the full path
829    $real_filename =~ s/^\\(.*)/$1/i;
830
831    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
832
833    if (&FileUtils::fileExists($raw_filename)) {
834
835#       if (defined $collect_dir) {
836#       my $collect_dir_re_safe = $collect_dir;
837#       $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
838#       $collect_dir_re_safe =~ s/\./\\./g;##
839
840#       $real_filename =~ s/^$collect_dir_re_safe//;
841#       }
842       
843        if (defined $reverse_lookups) {
844        $reverse_lookups->{$real_filename} = 1;
845        }
846
847        if($field =~ m@assoc-file|src-file|meta-file@) {
848        $raw_filename = &util::abspath_to_placeholders($raw_filename);
849        }
850
851###     push(@{$oid_files->{$field}},$full_file);       
852        push(@{$oid_files->{$field}},$raw_filename);
853    }
854    else {
855        print STDERR "Warning: archiveinf_files_to_field()\n  $real_filename does not appear to be on the file system\n";
856    }
857    }
858}
859
860sub archiveinf_db
861{
862    my $self = shift (@_);
863    my ($doc_obj) = @_;
864
865    my $verbosity = $self->{'verbosity'};
866
867    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
868    if (defined $collect_dir) {
869    my $dirsep_regexp = &util::get_os_dirsep();
870
871    if ($collect_dir !~ /$dirsep_regexp$/) {
872        # ensure there is a slash at the end
873        $collect_dir .= &util::get_dirsep();
874    }
875    }
876
877    my $oid = $doc_obj->get_OID();
878    my $source_filename = $doc_obj->get_unmodified_source_filename();
879    my $working_info = $self->{'output_info'};
880    my $doc_info = $working_info->get_info($oid);
881
882    my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
883    # doc_file is the path to the archive doc.xml. Make sure it has unix
884    # slashes, then if the collection is copied to linux, it can be built without reimport
885    $doc_file =~ s/\\/\//g;
886    my $oid_files = { 'doc-file' => $doc_file,
887              'index-status' => $index_status,
888              'src-file' => $source_filename,
889              'sort-meta' => $sortmeta,
890              'assoc-file' => [],
891              'meta-file'  => [] };
892    if (defined $group_position) {
893    $oid_files->{'group-position'} = $group_position;
894    }
895    my $reverse_lookups = { $source_filename => "1" };
896
897
898    $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
899                     $collect_dir,$oid_files,$reverse_lookups);
900
901
902    $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
903                     $collect_dir,$oid_files);
904
905    # Get the infodbtype value for this collection from the arcinfo object
906    my $infodbtype = $self->{'output_info'}->{'infodbtype'};
907    my $output_dir = $self->{'output_dir'};
908
909    my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
910
911    ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
912
913    if (!$self->{'no_rss'})
914    {
915      if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
916    my $top_section = $doc_obj->get_top_section();
917   
918    # rss_title can be set in collect.cfg as follows:
919    #      plugout GreenstoneXMLPlugout -rss_title "dc.Title; ex.Title"
920    # rss_title is a semi-colon or comma-separated list of the metadata field names that should
921    # be consulted in order to obtain a Title (anchor text) for the RSS document link.   
922    # If not specified, rss_title will default to dc.Title, and fall back on Untitled
923    my $metafieldnames = $self->{'rss_title'};
924    my @metafieldarray = split(/[,;] ?/,$metafieldnames); # , or ; separator can be followed by an optional space
925    my $titles;
926    #@$titles=(); # at worst @$titles will be (), as get_metadata(dc.Titles) may return ()
927    foreach my $metafieldname (@metafieldarray) {
928        $metafieldname =~ s@^ex\.@@; # if ex.Title, need to get_metadata() on metafieldname=Title
929        $titles = $doc_obj->get_metadata($top_section,$metafieldname);
930
931        if(scalar(@$titles) != 0) { # found at least one title for one metafieldname
932           last; # break out of the loop
933        }
934    }
935   
936    # if ex.Title was listed in the metafieldnames, then we'll surely have a value for title for this doc
937    # otherwise, if we have no titles at this point, add in a default of Untitled as this doc's title
938    if(scalar(@$titles) == 0) { #&& $metafieldnames !~ m@ex.Title@) {
939        push(@$titles, "Untitled");
940    }
941   
942    # encode basic html entities like <>"& in the title(s), since the & char can break RSS links
943    for (my $i = 0; $i < scalar(@$titles); $i++) {
944        &ghtml::htmlsafe(@$titles[$i]);
945    }
946
947    my $dc_title = join("; ", @$titles);
948
949    if ($oid_files->{'index-status'} eq "R") {
950        $dc_title .= " (Updated)";
951    }
952
953        my $rss_entry = "<item>\n";
954        $rss_entry   .= "   <title>$dc_title</title>\n";
955    if(&util::is_gs3()) {
956        $rss_entry   .= "   <link>_httpdomain__httpcollection_/document/$oid</link>\n";
957    } else {
958        $rss_entry   .= "   <link>_httpdomainHtmlsafe__httpcollection_/document/$oid</link>\n";
959    }
960    $rss_entry   .= "</item>";
961
962        if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
963        {
964          my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
965          my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
966          &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
967          &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
968        }
969        else
970        {
971          my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
972          my $rss_fh;
973          if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh, "utf8"))
974          {
975        print $rss_fh $rss_entry . "\n";
976        &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
977          }
978          else
979          {
980        print STDERR "**** Failed to open $rss_filename\n$!\n";
981          }
982        }
983      }
984    }
985
986    $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
987    $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
988    $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
989    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
990    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
991    if (defined $oid_files->{'group-position'}) {
992    $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
993    }
994
995    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
996    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
997    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
998
999    foreach my $rl (keys %$reverse_lookups) {
1000    $working_info->add_reverseinfo($rl,$oid);
1001    } 
1002
1003    # meta files not set in reverse entry, but need to set the metadata flag
1004    if (defined $doc_obj->get_meta_files()) {
1005    foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1006        my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1007        $working_info->set_meta_file_flag($full_file);
1008    }
1009    }
1010}
1011
1012# This sub is called for every metadata.xml accepted for processing by by MetdataXMLPlugin
1013# and adds an entry into archiveinf-src.db for that file in the form:
1014#   [@THISCOLLECTPATH@/import/metadata.xml]
1015#   <meta-file>1
1016# This prevents blind reprocessing of the same old docs upon *incremental* building whenever
1017# we encounter a default empty metadata.xml that has no actual <FileSet> content defined.
1018sub add_metaxml_file_entry_to_archiveinfsrc {
1019    my $self = shift (@_);
1020    my ($full_file) = @_;
1021   
1022    print STDERR "**** Adding metaxml file entry for full_file: $full_file\n";
1023    my $working_info = $self->{'output_info'};
1024    $working_info->set_meta_file_flag($full_file);
1025}
1026
1027
1028sub set_sortmeta {
1029    my $self = shift (@_);
1030    my ($sortmeta, $removeprefix, $removesuffix) = @_;
1031   
1032    $self->{'sortmeta'} = $sortmeta;
1033    if (defined ($removeprefix) && $removeprefix ) {
1034    $removeprefix =~ s/^\^//; # don't need a leading ^
1035    $self->{'removeprefix'} = $removeprefix;
1036    }
1037    if (defined ($removesuffix) && $removesuffix) {
1038    $removesuffix =~ s/\$$//; # don't need a trailing $
1039    $self->{'removesuffix'} = $removesuffix;
1040    }
1041}
1042
1043
1044
1045sub open_xslt_pipe
1046{
1047    my $self = shift @_;
1048    my ($output_file_name, $xslt_file)=@_;
1049
1050    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1051   
1052    my $java_class_path =  &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1053
1054    my $mapping_file_path = "";
1055
1056    if ($ENV{'GSDLOS'} eq "windows"){
1057    $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1058    # this file:/// bit didn't work for me on windows XP
1059    #$xslt_file = "\"file:///".$xslt_file."\"";
1060    #$mapping_file_path = "\"file:///";
1061    }
1062    else{
1063    $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1064    }
1065
1066
1067    $java_class_path = "\"".$java_class_path."\"";
1068
1069    my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1070
1071    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1072    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1073    $cmd .= "-m $mapping_file_path";
1074    }
1075   
1076    open(*XMLWRITER, $cmd)
1077    or die "can't open pipe to xslt: $!";
1078
1079   
1080    $self->{'xslt_writer'} = *XMLWRITER;
1081
1082    print XMLWRITER "<?DocStart?>\n";       
1083    print XMLWRITER "$output_file_name\n";
1084
1085 
1086  }
1087 
1088
1089sub close_xslt_pipe
1090{
1091  my $self = shift @_;
1092
1093 
1094  return unless defined $self->{'xslt_writer'} ;
1095   
1096  my $xsltwriter = $self->{'xslt_writer'};
1097 
1098  print $xsltwriter "<?DocEnd?>\n";
1099  close($xsltwriter);
1100
1101  undef $self->{'xslt_writer'};
1102
1103}
1104
1105
1106
1107#the subclass should implement this method if is_group method could return 1.
1108sub close_group_output{
1109   my $self = shift (@_);       
1110}
1111
1112sub is_group {
1113    my $self = shift (@_);
1114    return 0;       
1115}
1116
1117my $dc_set = { Title => 1,       
1118           Creator => 1,
1119           Subject => 1,
1120           Description => 1,
1121           Publisher => 1,
1122           Contributor => 1,
1123           Date => 1,
1124           Type => 1,
1125           Format => 1,
1126           Identifier => 1,
1127           Source => 1,
1128           Language => 1,
1129           Relation => 1,
1130           Coverage => 1,
1131           Rights => 1};
1132
1133
1134# returns an XML representation of the dublin core metadata
1135# if dc meta is not found, try ex meta
1136# This method is not used by the DSpacePlugout, which has its
1137# own method to save its dc metadata
1138sub get_dc_metadata {
1139    my $self = shift(@_);
1140    my ($doc_obj, $section, $version) = @_;
1141   
1142    # build up string of dublin core metadata
1143    $section="" unless defined $section;
1144   
1145    my $section_ptr = $doc_obj->_lookup_section($section);
1146    return "" unless defined $section_ptr;
1147
1148
1149    my $explicit_dc = {};
1150    my $explicit_ex_dc = {};
1151    my $explicit_ex = {};
1152
1153    my $all_text="";
1154   
1155    # We want high quality dc metadata to go in first, so we store all the
1156    # assigned dc.* values first. Then, for all those dc metadata names in
1157    # the official dc set that are as yet unassigned, we look to see whether
1158    # embedded ex.dc.* metadata has defined some values for them. If not,
1159    # then for the same missing dc metadata names, we look in ex metadata.
1160
1161    foreach my $data (@{$section_ptr->{'metadata'}}){
1162    my $escaped_value = &docprint::escape_text($data->[1]);
1163    if ($data->[0]=~ m/^dc\./) {
1164        $data->[0] =~ tr/[A-Z]/[a-z]/;
1165
1166        $data->[0] =~ m/^dc\.(.*)/;
1167        my $dc_element =  $1;
1168
1169        if (!defined $explicit_dc->{$dc_element}) {
1170        $explicit_dc->{$dc_element} = [];
1171        }
1172        push(@{$explicit_dc->{$dc_element}},$escaped_value);
1173
1174        if (defined $version && ($version eq "oai_dc")) {
1175        $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1176        }
1177        else {
1178        # qualifier???
1179        $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1180        }
1181
1182    } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1183        $data->[0] =~ m/^ex\.dc\.(.*)/;
1184        my $ex_dc_element = $1;
1185        my $lc_ex_dc_element = lc($ex_dc_element);
1186
1187        # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1188        if (defined $dc_set->{$ex_dc_element}) {
1189        if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1190            $explicit_ex_dc->{$lc_ex_dc_element} = [];
1191        }
1192        push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1193        }
1194    }
1195    elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1196        $data->[0] =~ m/^(ex\.)?(.*)/;
1197        my $ex_element = $2;
1198        my $lc_ex_element = lc($ex_element);
1199
1200        if (defined $dc_set->{$ex_element}) {
1201        if (!defined $explicit_ex->{$lc_ex_element}) {
1202            $explicit_ex->{$lc_ex_element} = [];
1203        }
1204        push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1205        }
1206    }
1207    }
1208
1209    # go through dc_set and for any element *not* defined in explicit_dc
1210    # that does exist in explicit_ex, add it in as metadata
1211    foreach my $k ( keys %$dc_set ) {
1212    my $lc_k = lc($k);
1213
1214    if (!defined $explicit_dc->{$lc_k}) {
1215        # try to find if ex.dc.* defines this dc.* meta,
1216        # if not, then look for whether there's an ex.* equivalent
1217
1218        if (defined $explicit_ex_dc->{$lc_k}) {
1219        foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1220            my $dc_element    = $lc_k;
1221            my $escaped_value = $v;
1222           
1223            if (defined $version && ($version eq "oai_dc")) {
1224            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1225            }
1226            else {
1227            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1228            }           
1229        }
1230        } elsif (defined $explicit_ex->{$lc_k}) {
1231        foreach my $v (@{$explicit_ex->{$lc_k}}) {
1232            my $dc_element    = $lc_k;
1233            my $escaped_value = $v;
1234
1235            if (defined $version && ($version eq "oai_dc")) {
1236            $all_text .= "   <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1237            }
1238            else {
1239            $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1240            }
1241        }
1242        }
1243    }
1244    }
1245
1246    if ($all_text eq "") {
1247    $all_text .= "   There is no Dublin Core metatdata in this document\n";
1248    }   
1249    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1250
1251    return $all_text;
1252}
1253
1254# Build up dublin_core metadata.  Priority given to dc.* over ex.*
1255# This method was apparently added by Jeffrey and committed by Shaoqun.
1256# But we don't know why it was added, so not using it anymore.
1257sub new_get_dc_metadata {
1258   
1259    my $self = shift(@_);
1260    my ($doc_obj, $section, $version) = @_;
1261
1262    # build up string of dublin core metadata
1263    $section="" unless defined $section;
1264   
1265    my $section_ptr=$doc_obj->_lookup_section($section);
1266    return "" unless defined $section_ptr;
1267
1268    my $all_text = "";
1269    foreach my $data (@{$section_ptr->{'metadata'}}){
1270    my $escaped_value = &docprint::escape_text($data->[1]);
1271    my $dc_element =  $data->[0];
1272   
1273    my @array = split('\.',$dc_element);
1274    my ($type,$name);
1275
1276    if(defined $array[1])
1277    {
1278        $type = $array[0];
1279        $name = $array[1];
1280    }
1281    else
1282    {
1283        $type = "ex";
1284        $name = $array[0];
1285    }
1286   
1287    $all_text .= '   <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1288    }
1289    return $all_text;
1290}
1291
1292
12931;
Note: See TracBrowser for help on using the browser.