source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 28550

Last change on this file since 28550 was 28550, checked in by kjdon, 10 years ago

output in utf8. text is now proper unicode internally. Copied this line from the GreenstoneXMLplugout saveas method.

  • Property svn:keywords set to Author Date Id Revision
File size: 37.7 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use util;
39use FileUtils;
40
41# suppress the annoying "subroutine redefined" warning that various
42# gets cause under perl 5.6
43$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
44
45my $arguments = [
46 { 'name' => "group_size",
47 'desc' => "{BasPlugout.group_size}",
48 'type' => "int",
49 'deft' => "1",
50 'reqd' => "no",
51 'hiddengli' => "no"},
52 { 'name' => "output_info",
53 'desc' => "{BasPlugout.output_info}",
54 'type' => "string",
55 'reqd' => "yes",
56 'hiddengli' => "yes"},
57 { 'name' => "xslt_file",
58 'desc' => "{BasPlugout.xslt_file}",
59 'type' => "string",
60 'reqd' => "no",
61 'deft' => "",
62 'hiddengli' => "no"},
63 { 'name' => "output_handle",
64 'desc' => "{BasPlugout.output_handle}",
65 'type' => "string",
66 'deft' => 'STDERR',
67 'reqd' => "no",
68 'hiddengli' => "yes"},
69 { 'name' => "verbosity",
70 'desc' => "{BasPlugout.verbosity}",
71 'type' => "int",
72 'deft' => "0",
73 'reqd' => "no",
74 'hiddengli' => "no"},
75 { 'name' => "gzip_output",
76 'desc' => "{BasPlugout.gzip_output}",
77 'type' => "flag",
78 'reqd' => "no",
79 'hiddengli' => "no"},
80 { 'name' => "debug",
81 'desc' => "{BasPlugout.debug}",
82 'type' => "flag",
83 'reqd' => "no",
84 'hiddengli' => "yes"},
85 { 'name' => "subdir_split_length",
86 'desc' => "{BasPlugout.subdir_split_length}",
87 'type' => "int",
88 'reqd' => "no",
89 'deft' => "8",
90 'hiddengli' => "no"},
91 { 'name' => "subdir_hash_prefix",
92 'desc' => "{BasPlugout.subdir_hash_prefix}",
93 'type' => "flag",
94 'reqd' => "no",
95 'deft' => "0",
96 'hiddengli' => "no"},
97 { 'name' => 'no_rss',
98 'desc' => '{BasPlugout.no_rss}',
99 'type' => 'flag',
100 'reqd' => 'no',
101 'hiddengli' => 'yes'}
102];
103
104my $options = { 'name' => "BasePlugout",
105 'desc' => "{BasPlugout.desc}",
106 'abstract' => "yes",
107 'inherits' => "no",
108 'args' => $arguments};
109
110sub new
111{
112 my $class = shift (@_);
113
114 my ($plugoutlist,$args,$hashArgOptLists) = @_;
115 push(@$plugoutlist, $class);
116
117 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
118
119 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
120 push(@{$hashArgOptLists->{"OptList"}},$options);
121
122 my $self = {};
123 $self->{'plugout_type'} = $class;
124 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
125 $self->{"info_only"} = 0;
126
127 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
128 # the args, just return the object.
129 foreach my $strArg (@{$args})
130 {
131 if(defined $strArg && $strArg eq "-gsdlinfo")
132 {
133 $self->{"info_only"} = 1;
134 return bless $self, $class;
135 }
136 }
137
138 delete $self->{"info_only"};
139
140 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
141 {
142 my $classTempClass = bless $self, $class;
143 print STDERR "<BadPlugout d=$plugout_name>\n";
144 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
145 $classTempClass->print_txt_usage(""); # Use default resource bundle
146 die "\n";
147 }
148
149
150 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
151 {
152 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
153 if (!defined $full_file_path) {
154 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
155 die "\n";
156 }
157 $self->{'xslt_file'} = $full_file_path;
158 }
159
160 $self->{'gs_count'} = 0;
161
162 $self->{'keep_import_structure'} = 0;
163
164 return bless $self, $class;
165
166}
167
168sub print_xml_usage
169{
170 my $self = shift(@_);
171 my $header = shift(@_);
172 my $high_level_information_only = shift(@_);
173
174 # XML output is always in UTF-8
175 gsprintf::output_strings_in_UTF8;
176
177 if ($header) {
178 &PrintUsage::print_xml_header("plugout");
179 }
180 $self->print_xml($high_level_information_only);
181}
182
183
184sub print_xml
185{
186 my $self = shift(@_);
187 my $high_level_information_only = shift(@_);
188
189 my $optionlistref = $self->{'option_list'};
190 my @optionlist = @$optionlistref;
191 my $plugoutoptions = shift(@$optionlistref);
192 return if (!defined($plugoutoptions));
193
194 gsprintf(STDERR, "<PlugoutInfo>\n");
195 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
196 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
197 $desc =~ s/</&amp;lt;/g; # doubly escaped
198 $desc =~ s/>/&amp;gt;/g;
199 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
200 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
201 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
202 unless (defined($high_level_information_only)) {
203 gsprintf(STDERR, " <Arguments>\n");
204 if (defined($plugoutoptions->{'args'})) {
205 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
206 }
207 gsprintf(STDERR, " </Arguments>\n");
208
209 # Recurse up the plugout hierarchy
210 $self->print_xml();
211 }
212 gsprintf(STDERR, "</PlugoutInfo>\n");
213}
214
215
216sub print_txt_usage
217{
218 my $self = shift(@_);
219
220 # Print the usage message for a plugout (recursively)
221 my $descoffset = $self->determine_description_offset(0);
222 $self->print_plugout_usage($descoffset, 1);
223}
224
225sub determine_description_offset
226{
227 my $self = shift(@_);
228 my $maxoffset = shift(@_);
229
230 my $optionlistref = $self->{'option_list'};
231 my @optionlist = @$optionlistref;
232 my $plugoutoptions = pop(@$optionlistref);
233 return $maxoffset if (!defined($plugoutoptions));
234
235 # Find the length of the longest option string of this download
236 my $plugoutargs = $plugoutoptions->{'args'};
237 if (defined($plugoutargs)) {
238 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
239 if ($longest > $maxoffset) {
240 $maxoffset = $longest;
241 }
242 }
243
244 # Recurse up the download hierarchy
245 $maxoffset = $self->determine_description_offset($maxoffset);
246 $self->{'option_list'} = \@optionlist;
247 return $maxoffset;
248}
249
250
251sub print_plugout_usage
252{
253 my $self = shift(@_);
254 my $descoffset = shift(@_);
255 my $isleafclass = shift(@_);
256
257 my $optionlistref = $self->{'option_list'};
258 my @optionlist = @$optionlistref;
259 my $plugoutoptions = shift(@$optionlistref);
260 return if (!defined($plugoutoptions));
261
262 my $plugoutname = $plugoutoptions->{'name'};
263 my $plugoutargs = $plugoutoptions->{'args'};
264 my $plugoutdesc = $plugoutoptions->{'desc'};
265
266 # Produce the usage information using the data structure above
267 if ($isleafclass) {
268 if (defined($plugoutdesc)) {
269 gsprintf(STDERR, "$plugoutdesc\n\n");
270 }
271 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
272 }
273
274 # Display the download options, if there are some
275 if (defined($plugoutargs)) {
276 # Calculate the column offset of the option descriptions
277 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
278
279 if ($isleafclass) {
280 gsprintf(STDERR, " {common.specific_options}:\n");
281 }
282 else {
283 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
284 }
285
286 # Display the download options
287 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
288 }
289
290 # Recurse up the download hierarchy
291 $self->print_plugout_usage($descoffset, 0);
292 $self->{'option_list'} = \@optionlist;
293}
294
295
296sub error
297{
298 my ($strFunctionName,$strError) = @_;
299 {
300 print "Error occoured in BasePlugout.pm\n".
301 "In Function: ".$strFunctionName."\n".
302 "Error Message: ".$strError."\n";
303 exit(-1);
304 }
305}
306
307# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
308sub set_OIDtype {
309 my $self = shift (@_);
310 my ($type, $metadata) = @_;
311
312 if ($type =~ /^(hash|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
313 $self->{'OIDtype'} = $type;
314 } else {
315 $self->{'OIDtype'} = "hash";
316 }
317 if ($type =~ /^assigned$/) {
318 if (defined $metadata) {
319 $self->{'OIDmetadata'} = $metadata;
320 } else {
321 $self->{'OIDmetadata'} = "dc.Identifier";
322 }
323 }
324}
325
326sub set_output_dir
327{
328 my $self = shift @_;
329 my ($output_dir) = @_;
330
331 $self->{'output_dir'} = $output_dir;
332}
333
334sub setoutputdir
335{
336 my $self = shift @_;
337 my ($output_dir) = @_;
338
339 $self->{'output_dir'} = $output_dir;
340}
341
342sub get_output_dir
343{
344 my $self = shift (@_);
345
346 return $self->{'output_dir'};
347}
348
349sub getoutputdir
350{
351 my $self = shift (@_);
352
353 return $self->{'output_dir'};
354}
355
356sub getoutputinfo
357{
358 my $self = shift (@_);
359
360 return $self->{'output_info'};
361}
362
363
364sub get_output_handler
365{
366 my $self = shift (@_);
367
368 my ($output_file_name) = @_;
369
370 my $fh;
371 &FileUtils::openFileHandle($output_file_name, '>', \$fh) or die('Can not open a file handler for: ' . $output_file_name . "\n");
372
373 return $fh;
374}
375
376sub release_output_handler
377{
378 my $self = shift (@_);
379 my ($outhandler) = @_;
380
381 close($outhandler);
382
383}
384
385sub output_xml_header {
386 my $self = shift (@_);
387 my ($handle,$docroot,$nondoctype) = @_;
388
389
390 #print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
391
392 #For Dspace must be UTF in lower case
393 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
394
395 if (!defined $nondoctype){
396 my $doctype = (defined $docroot) ? $docroot : "Section";
397
398 # Used to be '<!DOCTYPE Archive SYSTEM ...'
399
400 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
401 }
402
403 print $handle "<$docroot>\n" if defined $docroot;
404}
405
406sub output_xml_footer {
407 my $self = shift (@_);
408 my ($handle,$docroot) = @_;
409 print $handle "</$docroot>\n" if defined $docroot;
410}
411
412
413sub output_general_xml_header
414{
415 my $self = shift (@_);
416 my ($handle,$docroot,$opt_attributes,$opt_dtd, $opt_doctype) = @_;
417
418 print $handle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
419
420 if (defined $opt_dtd) {
421 my $doctype = (defined $opt_doctype) ? $opt_doctype : $docroot;
422 print $handle "<!DOCTYPE $doctype SYSTEM \"$opt_dtd\">\n";
423 }
424
425 if (defined $docroot) {
426 my $full_docroot = $docroot;
427 if (defined $opt_attributes) {
428 $full_docroot .= " $opt_attributes";
429 }
430
431 print $handle "<$full_docroot>\n"
432 }
433}
434
435sub output_general_xml_footer
436{
437 output_xml_footer(@_);
438}
439
440
441sub process {
442 my $self = shift (@_);
443 my ($doc_obj) = @_;
444
445 # for OAI purposes
446 $doc_obj->set_lastmodified();
447 $doc_obj->set_oailastmodified();
448
449 if ($self->{'group_size'} > 1) {
450 $self->group_process ($doc_obj);
451 return;
452 }
453
454 my $OID = $doc_obj->get_OID();
455 $OID = "NULL" unless defined $OID;
456
457 my $top_section = $doc_obj->get_top_section();
458
459 #get document's directory
460 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
461
462 my $output_info = $self->{'output_info'};
463 return if (!defined $output_info);
464
465 ##############################
466 # call subclass' saveas method
467 ##############################
468 $self->saveas($doc_obj,$doc_dir);
469 $self->archiveinf_db($doc_obj,$doc_dir);
470
471}
472
473sub store_output_info_reference {
474 my $self = shift (@_);
475 my ($doc_obj) = @_;
476
477 my $output_info = $self->{'output_info'};
478 my $metaname = $self->{'sortmeta'};
479
480 if (!defined $metaname || $metaname !~ /\S/) {
481 my $OID = $doc_obj->get_OID();
482 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "");
483 return;
484 }
485
486 if ($metaname eq "OID") { # sort by OID
487 my $OID = $doc_obj->get_OID();
488 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID);
489 return;
490 }
491
492 my $metadata = "";
493 my $top_section = $doc_obj->get_top_section();
494
495 my @commameta_list = split(/,/, $metaname);
496 foreach my $cmn (@commameta_list) {
497 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
498 if ($meta) {
499 # do remove prefix/suffix - this will apply to all values
500 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
501 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
502 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
503 $metadata .= $meta if ($meta);
504 }
505 }
506
507 # store reference in the output_info
508 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
509
510}
511
512sub group_process {
513
514 my $self = shift (@_);
515 my ($doc_obj) = @_;
516
517 my $OID = $doc_obj->get_OID();
518 $OID = "NULL" unless defined $OID;
519
520 my $groupsize = $self->{'group_size'};
521 my $gs_count = $self->{'gs_count'};
522 my $open_new_file = (($gs_count % $groupsize)==0);
523 my $outhandle = $self->{'output_handle'};
524
525 # opening a new file, or document has assoicated files => directory needed
526 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
527
528 # The directory the archive file (doc.xml) and all associated files
529 # should end up in
530 my $doc_dir;
531 # If we've determined its time for a new file, open it now
532 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
533 {
534 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
535 # only if opening new file
536 my $output_dir = $self->get_output_dir();
537 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
538 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
539 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
540
541 if ($gs_count>0)
542 {
543 return if (!$self->close_file_output());
544 }
545
546 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
547
548 binmode(GROUPPROCESS, ":utf8");
549 $self->{'gs_filename'} = $doc_file;
550 $self->{'short_doc_file'} = $short_doc_file;
551 $self->{'gs_OID'} = $OID;
552 $self->{'gs_doc_dir'} = $doc_dir;
553
554 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
555 }
556 # Otherwise load the same archive document directory used last time
557 else
558 {
559 $doc_dir = $self->{'gs_doc_dir'};
560 }
561
562 # copy all the associated files, add this information as metadata
563 # to the document
564 print $outhandle "Writing associated files to $doc_dir\n";
565 $self->process_assoc_files ($doc_obj, $doc_dir);
566
567 # look up 'gsdlmetafile' metadata and store that information
568 # explicitly in $doc_obj
569 $self->process_metafiles_metadata ($doc_obj);
570 }
571
572 # save this document
573 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
574 print GROUPPROCESS $section_text;
575
576 $self->{'gs_count'}++;
577}
578
579
580sub saveas {
581 my $self = shift (@_);
582
583 die "Basplug::saveas function must be implemented in sub classes\n";
584}
585
586sub get_doc_dir {
587 my $self = shift (@_);
588 my ($OID, $source_filename) = @_;
589
590 my $working_dir = $self->get_output_dir();
591 my $working_info = $self->{'output_info'};
592 return if (!defined $working_info);
593
594 my $doc_info = $working_info->get_info($OID);
595 my $doc_dir = '';
596
597 if (defined $doc_info && scalar(@$doc_info) >= 1)
598 {
599 # This OID already has an archives directory, so use it again
600 $doc_dir = $doc_info->[0];
601 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
602 }
603 elsif ($self->{'keep_import_structure'})
604 {
605 $source_filename = &File::Basename::dirname($source_filename);
606 $source_filename =~ s/[\\\/]+/\//g;
607 $source_filename =~ s/\/$//;
608
609 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
610 }
611
612 # We have to use a new archives directory for this document
613 if ($doc_dir eq "")
614 {
615 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
616 }
617
618 if (!defined $self->{'group'} || !$self->{'group'}){
619 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
620 }
621
622 return $doc_dir;
623}
624
625# Before parallel building, this was the method that created the new doc dirs in archives
626sub get_new_doc_dir_OLD {
627 my $self = shift (@_);
628 my($working_info,$working_dir,$OID) = @_;
629
630
631 my $doc_dir = "";
632 my $doc_dir_rest = $OID;
633
634 # remove any \ and / from the OID
635 $doc_dir_rest =~ s/[\\\/]//g;
636
637 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
638 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
639
640 my $doc_dir_num = 0;
641
642 do {
643 $doc_dir .= "/" if $doc_dir_num > 0;
644 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
645 if ($self->{'subdir_hash_prefix'})
646 {
647 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
648 }
649 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
650 if ($doc_dir_rest =~ s/$pattern//i)
651 {
652 $doc_dir .= $1;
653 $doc_dir_num++;
654 }
655 } while ($doc_dir_rest ne "" &&
656 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
657 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
658 my $i = 1;
659 my $doc_dir_base = $doc_dir;
660 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
661 $doc_dir = "$doc_dir_base-$i";
662 $i++;
663 }
664
665 return "$doc_dir.dir";
666}
667
668## @function get_new_doc_dir()
669#
670# Once a doc object is ready to write to disk (and hence has a nice OID),
671# generate a unique subdirectory to write the information to.
672#
673# - create the directory as part of this call, to try and avoid race conditions
674# found in parallel processing [jmt12]
675#
676# @todo figure out what the rule regarding $work_info->size() is meant to do
677#
678# @todo determine what $self->{'group'} is, and whether it should affect
679# directory creation
680#
681sub get_new_doc_dir
682{
683 my $self = shift (@_);
684 my($working_info,$working_dir,$OID) = @_;
685
686 my $doc_dir = "";
687 my $doc_dir_rest = $OID;
688
689 # remove any \ and / from the OID
690 $doc_dir_rest =~ s/[\\\/]//g;
691
692 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
693 if ($ENV{'GSDLOS'} =~ /^windows$/i)
694 {
695 $doc_dir_rest =~ s/\://g;
696 }
697
698 # we generally create a unique directory by adding consequtive fragments of
699 # the document identifier (split by some predefined length - defaulting to
700 # 8) until we find a directory that doesn't yet exist. Note that directories
701 # that contain a document have a suffix ".dir" (whereas those that contain
702 # only subdirectories have no suffix).
703 my $doc_dir_num = 0; # how many directories deep we are
704 my $created_directory = 0; # have we successfully created a new directory
705 do
706 {
707 # (does this work on windows? - jmt12)
708 if ($doc_dir_num > 0)
709 {
710 $doc_dir .= '/';
711 }
712
713 # the default matching pattern grabs the next 'subdir_split_length'
714 # characters of the OID to act as the next subdirectory
715 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
716
717 # Do we count any "HASH" prefix against the split length limit?
718 if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
719 {
720 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
721 }
722
723 # Note the use of 's' to both capture the next chuck of OID and to remove
724 # it from OID at the same time
725 if ($doc_dir_rest =~ s/$pattern//i)
726 {
727 $doc_dir .= $1;
728 $doc_dir_num++;
729
730 my $full_doc_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
731 if(!FileUtils::directoryExists($full_doc_dir))
732 {
733 &FileUtils::makeAllDirectories($full_doc_dir);
734 $created_directory = 1;
735 }
736
737 ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
738 ###rint STDERR " - create directory: $full_doc_dir => $created_directory\n";
739 ###rint STDERR " - rest: $doc_dir_rest\n";
740 ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
741 ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
742 }
743 }
744 while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
745
746 # not unique yet? Add on an incremental suffix until we are unique
747 my $i = 1;
748 my $doc_dir_base = $doc_dir;
749 while ($created_directory == 0)
750 {
751 $doc_dir = $doc_dir_base . '-' . $i;
752 $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
753 $i++;
754 }
755
756 # in theory this should never happen
757 if (!$created_directory)
758 {
759 die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
760 }
761
762 return $doc_dir . '.dir';
763}
764## get_new_doc_dir()
765
766
767sub process_assoc_files {
768 my $self = shift (@_);
769 my ($doc_obj, $doc_dir, $handle) = @_;
770
771 my $outhandle = $self->{'output_handle'};
772
773 my $output_dir = $self->get_output_dir();
774 return if (!defined $output_dir);
775
776 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
777
778 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
779 &FileUtils::makeAllDirectories($working_dir) unless &FileUtils::directoryExists($working_dir);
780
781 my @assoc_files = ();
782 my $filename;;
783
784 my $source_filename = $doc_obj->get_source_filename();
785
786 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
787
788 if (defined $collect_dir) {
789 my $dirsep_regexp = &util::get_os_dirsep();
790
791 if ($collect_dir !~ /$dirsep_regexp$/) {
792 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
793 }
794
795 # This test is never going to fail on Windows -- is this a problem?
796
797 if ($source_filename !~ /^$dirsep_regexp/) {
798 $source_filename = &FileUtils::filenameConcatenate($collect_dir, $source_filename);
799 }
800 }
801
802
803 # set the assocfile path (even if we have no assoc files - need this for lucene)
804 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
805 "assocfilepath",
806 "$doc_dir");
807 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
808 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
809 $dir = "" unless defined $dir;
810
811 my $utf8_real_filename = $assoc_file_rec->[0];
812
813 # for some reasons the image associate file has / before the full path
814 $utf8_real_filename =~ s/^\\(.*)/$1/i;
815
816## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
817 my $real_filename = $utf8_real_filename;
818 $real_filename = &util::downgrade_if_dos_filename($real_filename);
819
820 if (&FileUtils::fileExists($real_filename)) {
821
822 $filename = &FileUtils::filenameConcatenate($working_dir, $afile);
823
824 &FileUtils::hardLink($real_filename, $filename, $self->{'verbosity'});
825
826 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
827 "gsdlassocfile",
828 "$afile:$assoc_file_rec->[2]:$dir");
829 } elsif ($self->{'verbosity'} > 1) {
830 print $outhandle "BasePlugout::process couldn't copy the associated file " .
831 "$real_filename to $afile\n";
832 }
833 }
834}
835
836
837sub process_metafiles_metadata
838{
839 my $self = shift (@_);
840 my ($doc_obj) = @_;
841
842 my $top_section = $doc_obj->get_top_section();
843 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
844
845 foreach my $metafile_pair (@$metafiles) {
846 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
847
848 $doc_obj->metadata_file($full_metafile,$metafile);
849 }
850
851 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
852}
853
854sub archiveinf_files_to_field
855{
856 my $self = shift(@_);
857 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
858
859 foreach my $file_rec (@$files) {
860 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
861 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
862 # for some reasons the image associate file has / before the full path
863 $real_filename =~ s/^\\(.*)/$1/i;
864
865 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
866
867 if (&FileUtils::fileExists($raw_filename)) {
868
869# if (defined $collect_dir) {
870# my $collect_dir_re_safe = $collect_dir;
871# $collect_dir_re_safe =~ s/\\/\\\\/g; # use &util::filename_to_regex()
872# $collect_dir_re_safe =~ s/\./\\./g;##
873
874# $real_filename =~ s/^$collect_dir_re_safe//;
875# }
876
877 if (defined $reverse_lookups) {
878 $reverse_lookups->{$real_filename} = 1;
879 }
880
881 if($field =~ m@assoc-file|src-file|meta-file@) {
882 $raw_filename = &util::abspath_to_placeholders($raw_filename);
883 }
884
885### push(@{$oid_files->{$field}},$full_file);
886 push(@{$oid_files->{$field}},$raw_filename);
887 }
888 else {
889 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
890 }
891 }
892}
893
894sub archiveinf_db
895{
896 my $self = shift (@_);
897 my ($doc_obj) = @_;
898
899 my $verbosity = $self->{'verbosity'};
900
901 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
902 if (defined $collect_dir) {
903 my $dirsep_regexp = &util::get_os_dirsep();
904
905 if ($collect_dir !~ /$dirsep_regexp$/) {
906 # ensure there is a slash at the end
907 $collect_dir .= &util::get_dirsep();
908 }
909 }
910
911 my $oid = $doc_obj->get_OID();
912 my $source_filename = $doc_obj->get_unmodified_source_filename();
913 my $working_info = $self->{'output_info'};
914 my $doc_info = $working_info->get_info($oid);
915
916 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
917 # doc_file is the path to the archive doc.xml. Make sure it has unix
918 # slashes, then if the collection is copied to linux, it can be built without reimport
919 $doc_file =~ s/\\/\//g;
920 my $oid_files = { 'doc-file' => $doc_file,
921 'index-status' => $index_status,
922 'src-file' => $source_filename,
923 'sort-meta' => $sortmeta,
924 'assoc-file' => [],
925 'meta-file' => [] };
926
927 my $reverse_lookups = { $source_filename => "1" };
928
929
930 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
931 $collect_dir,$oid_files,$reverse_lookups);
932
933
934 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
935 $collect_dir,$oid_files);
936
937 # Get the infodbtype value for this collection from the arcinfo object
938 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
939 my $output_dir = $self->{'output_dir'};
940
941 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
942
943 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
944
945 if (!$self->{'no_rss'})
946 {
947 if (($oid_files->{'index-status'} eq "I") || ($oid_files->{'index-status'} eq "R")) {
948 my $top_section = $doc_obj->get_top_section();
949
950 my $dc_titles = $doc_obj->get_metadata($top_section,"dc.Title");
951 my $dc_title = join("; ", @$dc_titles);
952
953 if ($oid_files->{'index-status'} eq "R") {
954 $dc_title .= " (Updated)";
955 }
956
957 my $rss_entry = "<item>\n";
958 $rss_entry .= " <title>$dc_title</title>\n";
959 $rss_entry .= " <link>_httpdomain__httpcollection_/document/$oid</link>\n";
960 $rss_entry .= "</item>";
961
962 if (defined(&dbutil::supportsRSS) && &dbutil::supportsRSS($infodbtype))
963 {
964 my $rss_db = &dbutil::get_infodb_file_path($infodbtype, 'rss-items', $output_dir);
965 my $rss_db_fh = &dbutil::open_infodb_write_handle($infodbtype, $rss_db, 'append');
966 &dbutil::write_infodb_rawentry($infodbtype, $rss_db_fh, $oid, $rss_entry);
967 &dbutil::close_infodb_write_handle($infodbtype, $rss_db_fh);
968 }
969 else
970 {
971 my $rss_filename = &FileUtils::filenameConcatenate($output_dir,"rss-items.rdf");
972 my $rss_fh;
973 if (&FileUtils::openFileHandle($rss_filename, '>>', \$rss_fh))
974 {
975 print $rss_fh $rss_entry . "\n";
976 &FileUtils::closeFileHandle($rss_filename, \$rss_fh);
977 }
978 else
979 {
980 print STDERR "**** Failed to open $rss_filename\n$!\n";
981 }
982 }
983 }
984 }
985
986 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
987 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
988 $oid_files->{'src-file'} = &util::abspath_to_placeholders($oid_files->{'src-file'});
989 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
990 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
991
992 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
993 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
994 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
995
996 foreach my $rl (keys %$reverse_lookups) {
997 $working_info->add_reverseinfo($rl,$oid);
998 }
999
1000 # meta files not set in reverese entry, but need to set the metadata flag
1001 if (defined $doc_obj->get_meta_files()) {
1002 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
1003 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
1004 $working_info->set_meta_file_flag($full_file);
1005 }
1006 }
1007}
1008
1009
1010sub set_sortmeta {
1011 my $self = shift (@_);
1012 my ($sortmeta, $removeprefix, $removesuffix) = @_;
1013
1014 $self->{'sortmeta'} = $sortmeta;
1015 if (defined ($removeprefix) && $removeprefix ) {
1016 $removeprefix =~ s/^\^//; # don't need a leading ^
1017 $self->{'removeprefix'} = $removeprefix;
1018 }
1019 if (defined ($removesuffix) && $removesuffix) {
1020 $removesuffix =~ s/\$$//; # don't need a trailing $
1021 $self->{'removesuffix'} = $removesuffix;
1022 }
1023}
1024
1025sub open_xslt_pipe
1026{
1027 my $self = shift @_;
1028 my ($output_file_name, $xslt_file)=@_;
1029
1030 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
1031
1032 my $java_class_path = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
1033
1034 my $mapping_file_path = "";
1035
1036 if ($ENV{'GSDLOS'} eq "windows"){
1037 $java_class_path .=";".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1038 # this file:/// bit didn't work for me on windows XP
1039 #$xslt_file = "\"file:///".$xslt_file."\"";
1040 #$mapping_file_path = "\"file:///";
1041 }
1042 else{
1043 $java_class_path .=":".&FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
1044 }
1045
1046
1047 $java_class_path = "\"".$java_class_path."\"";
1048
1049 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
1050
1051 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
1052 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
1053 $cmd .= "-m $mapping_file_path";
1054 }
1055
1056 open(*XMLWRITER, $cmd)
1057 or die "can't open pipe to xslt: $!";
1058
1059
1060 $self->{'xslt_writer'} = *XMLWRITER;
1061
1062 print XMLWRITER "<?DocStart?>\n";
1063 print XMLWRITER "$output_file_name\n";
1064
1065
1066 }
1067
1068
1069sub close_xslt_pipe
1070{
1071 my $self = shift @_;
1072
1073
1074 return unless defined $self->{'xslt_writer'} ;
1075
1076 my $xsltwriter = $self->{'xslt_writer'};
1077
1078 print $xsltwriter "<?DocEnd?>\n";
1079 close($xsltwriter);
1080
1081 undef $self->{'xslt_writer'};
1082
1083}
1084
1085sub close_file_output
1086{
1087 my ($self) = @_;
1088
1089 # make sure that the handle has been opened - it won't be if we failed
1090 # to import any documents...
1091 if (defined(fileno(GROUPPROCESS))) {
1092 $self->output_xml_footer('GROUPPROCESS','Archive');
1093 close GROUPPROCESS;
1094 }
1095
1096 my $OID = $self->{'gs_OID'};
1097 my $short_doc_file = $self->{'short_doc_file'};
1098
1099 if ($self->{'gzip'}) {
1100 my $doc_file = $self->{'gs_filename'};
1101 `gzip $doc_file`;
1102 $doc_file .= ".gz";
1103 $short_doc_file .= ".gz";
1104 if (!&FileUtils::fileExists($doc_file)) {
1105 my $outhandle = $self->{'output_handle'};
1106 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
1107 return 0;
1108 }
1109 }
1110
1111 # store reference in output_info
1112 my $output_info = $self->{'output_info'};
1113 return 0 if (!defined $output_info);
1114 $output_info->add_info($OID, $short_doc_file, undef, undef);
1115 return 1;
1116}
1117
1118
1119#the subclass should implement this method if is_group method could return 1.
1120sub close_group_output{
1121 my $self = shift (@_);
1122}
1123
1124sub is_group {
1125 my $self = shift (@_);
1126 return 0;
1127}
1128
1129my $dc_set = { Title => 1,
1130 Creator => 1,
1131 Subject => 1,
1132 Description => 1,
1133 Publisher => 1,
1134 Contributor => 1,
1135 Date => 1,
1136 Type => 1,
1137 Format => 1,
1138 Identifier => 1,
1139 Source => 1,
1140 Language => 1,
1141 Relation => 1,
1142 Coverage => 1,
1143 Rights => 1};
1144
1145
1146# returns an XML representation of the dublin core metadata
1147# if dc meta is not found, try ex meta
1148# This method is not used by the DSpacePlugout, which has its
1149# own method to save its dc metadata
1150sub get_dc_metadata {
1151 my $self = shift(@_);
1152 my ($doc_obj, $section, $version) = @_;
1153
1154 # build up string of dublin core metadata
1155 $section="" unless defined $section;
1156
1157 my $section_ptr = $doc_obj->_lookup_section($section);
1158 return "" unless defined $section_ptr;
1159
1160
1161 my $explicit_dc = {};
1162 my $explicit_ex_dc = {};
1163 my $explicit_ex = {};
1164
1165 my $all_text="";
1166
1167 # We want high quality dc metadata to go in first, so we store all the
1168 # assigned dc.* values first. Then, for all those dc metadata names in
1169 # the official dc set that are as yet unassigned, we look to see whether
1170 # embedded ex.dc.* metadata has defined some values for them. If not,
1171 # then for the same missing dc metadata names, we look in ex metadata.
1172
1173 foreach my $data (@{$section_ptr->{'metadata'}}){
1174 my $escaped_value = &docprint::escape_text($data->[1]);
1175 if ($data->[0]=~ m/^dc\./) {
1176 $data->[0] =~ tr/[A-Z]/[a-z]/;
1177
1178 $data->[0] =~ m/^dc\.(.*)/;
1179 my $dc_element = $1;
1180
1181 if (!defined $explicit_dc->{$dc_element}) {
1182 $explicit_dc->{$dc_element} = [];
1183 }
1184 push(@{$explicit_dc->{$dc_element}},$escaped_value);
1185
1186 if (defined $version && ($version eq "oai_dc")) {
1187 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1188 }
1189 else {
1190 # qualifier???
1191 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1192 }
1193
1194 } elsif ($data->[0]=~ m/^ex\.dc\./) { # now look through ex.dc.* to fill in as yet unassigned fields in dc metaset
1195 $data->[0] =~ m/^ex\.dc\.(.*)/;
1196 my $ex_dc_element = $1;
1197 my $lc_ex_dc_element = lc($ex_dc_element);
1198
1199 # only store the ex.dc value for this dc metaname if no dc.* was assigned for it
1200 if (defined $dc_set->{$ex_dc_element}) {
1201 if (!defined $explicit_ex_dc->{$lc_ex_dc_element}) {
1202 $explicit_ex_dc->{$lc_ex_dc_element} = [];
1203 }
1204 push(@{$explicit_ex_dc->{$lc_ex_dc_element}},$escaped_value);
1205 }
1206 }
1207 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) { # look through ex. meta (incl. meta without prefix)
1208 $data->[0] =~ m/^(ex\.)?(.*)/;
1209 my $ex_element = $2;
1210 my $lc_ex_element = lc($ex_element);
1211
1212 if (defined $dc_set->{$ex_element}) {
1213 if (!defined $explicit_ex->{$lc_ex_element}) {
1214 $explicit_ex->{$lc_ex_element} = [];
1215 }
1216 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
1217 }
1218 }
1219 }
1220
1221 # go through dc_set and for any element *not* defined in explicit_dc
1222 # that does exist in explicit_ex, add it in as metadata
1223 foreach my $k ( keys %$dc_set ) {
1224 my $lc_k = lc($k);
1225
1226 if (!defined $explicit_dc->{$lc_k}) {
1227 # try to find if ex.dc.* defines this dc.* meta,
1228 # if not, then look for whether there's an ex.* equivalent
1229
1230 if (defined $explicit_ex_dc->{$lc_k}) {
1231 foreach my $v (@{$explicit_ex_dc->{$lc_k}}) {
1232 my $dc_element = $lc_k;
1233 my $escaped_value = $v;
1234
1235 if (defined $version && ($version eq "oai_dc")) {
1236 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1237 }
1238 else {
1239 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1240 }
1241 }
1242 } elsif (defined $explicit_ex->{$lc_k}) {
1243 foreach my $v (@{$explicit_ex->{$lc_k}}) {
1244 my $dc_element = $lc_k;
1245 my $escaped_value = $v;
1246
1247 if (defined $version && ($version eq "oai_dc")) {
1248 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
1249 }
1250 else {
1251 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1252 }
1253 }
1254 }
1255 }
1256 }
1257
1258 if ($all_text eq "") {
1259 $all_text .= " There is no Dublin Core metatdata in this document\n";
1260 }
1261 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1262
1263 return $all_text;
1264}
1265
1266# Build up dublin_core metadata. Priority given to dc.* over ex.*
1267# This method was apparently added by Jeffrey and committed by Shaoqun.
1268# But we don't know why it was added, so not using it anymore.
1269sub new_get_dc_metadata {
1270
1271 my $self = shift(@_);
1272 my ($doc_obj, $section, $version) = @_;
1273
1274 # build up string of dublin core metadata
1275 $section="" unless defined $section;
1276
1277 my $section_ptr=$doc_obj->_lookup_section($section);
1278 return "" unless defined $section_ptr;
1279
1280 my $all_text = "";
1281 foreach my $data (@{$section_ptr->{'metadata'}}){
1282 my $escaped_value = &docprint::escape_text($data->[1]);
1283 my $dc_element = $data->[0];
1284
1285 my @array = split('\.',$dc_element);
1286 my ($type,$name);
1287
1288 if(defined $array[1])
1289 {
1290 $type = $array[0];
1291 $name = $array[1];
1292 }
1293 else
1294 {
1295 $type = "ex";
1296 $name = $array[0];
1297 }
1298
1299 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1300 }
1301 return $all_text;
1302}
1303
1304
13051;
Note: See TracBrowser for help on using the repository browser.