source: main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm@ 23387

Last change on this file since 23387 was 23387, checked in by davidb, 13 years ago

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.8 KB
Line 
1###########################################################################
2#
3# BasePlugout.pm -- base class for all the plugout modules
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugout;
27
28eval {require bytes};
29
30use strict;
31no strict 'subs';
32no strict 'refs';
33
34use dbutil;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38
39
40# suppress the annoying "subroutine redefined" warning that various
41# gets cause under perl 5.6
42$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
43
44my $arguments = [
45 { 'name' => "group_size",
46 'desc' => "{BasPlugout.group_size}",
47 'type' => "int",
48 'deft' => "1",
49 'reqd' => "no",
50 'hiddengli' => "no"},
51 { 'name' => "output_info",
52 'desc' => "{BasPlugout.output_info}",
53 'type' => "string",
54 'reqd' => "yes",
55 'hiddengli' => "yes"},
56 { 'name' => "xslt_file",
57 'desc' => "{BasPlugout.xslt_file}",
58 'type' => "string",
59 'reqd' => "no",
60 'deft' => "",
61 'hiddengli' => "no"},
62 { 'name' => "output_handle",
63 'desc' => "{BasPlugout.output_handle}",
64 'type' => "string",
65 'deft' => 'STDERR',
66 'reqd' => "no",
67 'hiddengli' => "yes"},
68 { 'name' => "verbosity",
69 'desc' => "{BasPlugout.verbosity}",
70 'type' => "int",
71 'deft' => "0",
72 'reqd' => "no",
73 'hiddengli' => "no"},
74 { 'name' => "gzip_output",
75 'desc' => "{BasPlugout.gzip_output}",
76 'type' => "flag",
77 'reqd' => "no",
78 'hiddengli' => "no"},
79 { 'name' => "debug",
80 'desc' => "{BasPlugout.debug}",
81 'type' => "flag",
82 'reqd' => "no",
83 'hiddengli' => "yes"}
84];
85
86my $options = { 'name' => "BasePlugout",
87 'desc' => "{BasPlugout.desc}",
88 'abstract' => "yes",
89 'inherits' => "no",
90 'args' => $arguments};
91
92sub new
93{
94 my $class = shift (@_);
95
96 my ($plugoutlist,$args,$hashArgOptLists) = @_;
97 push(@$plugoutlist, $class);
98
99 my $plugout_name = (defined $plugoutlist->[0]) ? $plugoutlist->[0] : $class;
100
101 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
102 push(@{$hashArgOptLists->{"OptList"}},$options);
103
104 my $self = {};
105 $self->{'plugout_type'} = $class;
106 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
107 $self->{"info_only"} = 0;
108
109 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
110 # the args, just return the object.
111 foreach my $strArg (@{$args})
112 {
113 if(defined $strArg && $strArg eq "-gsdlinfo")
114 {
115 $self->{"info_only"} = 1;
116 return bless $self, $class;
117 }
118 }
119
120 delete $self->{"info_only"};
121
122 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
123 {
124 my $classTempClass = bless $self, $class;
125 print STDERR "<BadPlugout d=$plugout_name>\n";
126 &gsprintf(STDERR, "\n{BasPlugout.bad_general_option}\n", $plugout_name);
127 $classTempClass->print_txt_usage(""); # Use default resource bundle
128 die "\n";
129 }
130
131
132 if(defined $self->{'xslt_file'} && $self->{'xslt_file'} ne "")
133 {
134 my $full_file_path = &util::locate_config_file($self->{'xslt_file'});
135 if (!defined $full_file_path) {
136 print STDERR "Can not find $self->{'xslt_file'}, please make sure you have supplied the correct file path\n";
137 die "\n";
138 }
139 $self->{'xslt_file'} = $full_file_path;
140 }
141
142 $self->{'gs_count'} = 0;
143
144 $self->{'keep_import_structure'} = 0;
145
146 return bless $self, $class;
147
148}
149
150sub print_xml_usage
151{
152 my $self = shift(@_);
153 my $header = shift(@_);
154 my $high_level_information_only = shift(@_);
155
156 # XML output is always in UTF-8
157 gsprintf::output_strings_in_UTF8;
158
159 if ($header) {
160 &PrintUsage::print_xml_header("plugout");
161 }
162 $self->print_xml($high_level_information_only);
163}
164
165
166sub print_xml
167{
168 my $self = shift(@_);
169 my $high_level_information_only = shift(@_);
170
171 my $optionlistref = $self->{'option_list'};
172 my @optionlist = @$optionlistref;
173 my $plugoutoptions = shift(@$optionlistref);
174 return if (!defined($plugoutoptions));
175
176 gsprintf(STDERR, "<PlugoutInfo>\n");
177 gsprintf(STDERR, " <Name>$plugoutoptions->{'name'}</Name>\n");
178 my $desc = gsprintf::lookup_string($plugoutoptions->{'desc'});
179 $desc =~ s/</&amp;lt;/g; # doubly escaped
180 $desc =~ s/>/&amp;gt;/g;
181 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 gsprintf(STDERR, " <Abstract>$plugoutoptions->{'abstract'}</Abstract>\n");
183 gsprintf(STDERR, " <Inherits>$plugoutoptions->{'inherits'}</Inherits>\n");
184 unless (defined($high_level_information_only)) {
185 gsprintf(STDERR, " <Arguments>\n");
186 if (defined($plugoutoptions->{'args'})) {
187 &PrintUsage::print_options_xml($plugoutoptions->{'args'});
188 }
189 gsprintf(STDERR, " </Arguments>\n");
190
191 # Recurse up the plugout hierarchy
192 $self->print_xml();
193 }
194 gsprintf(STDERR, "</PlugoutInfo>\n");
195}
196
197
198sub print_txt_usage
199{
200 my $self = shift(@_);
201
202 # Print the usage message for a plugout (recursively)
203 my $descoffset = $self->determine_description_offset(0);
204 $self->print_plugout_usage($descoffset, 1);
205}
206
207sub determine_description_offset
208{
209 my $self = shift(@_);
210 my $maxoffset = shift(@_);
211
212 my $optionlistref = $self->{'option_list'};
213 my @optionlist = @$optionlistref;
214 my $plugoutoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($plugoutoptions));
216
217 # Find the length of the longest option string of this download
218 my $plugoutargs = $plugoutoptions->{'args'};
219 if (defined($plugoutargs)) {
220 my $longest = &PrintUsage::find_longest_option_string($plugoutargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the download hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugout_usage
234{
235 my $self = shift(@_);
236 my $descoffset = shift(@_);
237 my $isleafclass = shift(@_);
238
239 my $optionlistref = $self->{'option_list'};
240 my @optionlist = @$optionlistref;
241 my $plugoutoptions = shift(@$optionlistref);
242 return if (!defined($plugoutoptions));
243
244 my $plugoutname = $plugoutoptions->{'name'};
245 my $plugoutargs = $plugoutoptions->{'args'};
246 my $plugoutdesc = $plugoutoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugoutdesc)) {
251 gsprintf(STDERR, "$plugoutdesc\n\n");
252 }
253 gsprintf(STDERR, " {common.usage}: plugout $plugoutname [{common.options}]\n\n");
254 }
255
256 # Display the download options, if there are some
257 if (defined($plugoutargs)) {
258 # Calculate the column offset of the option descriptions
259 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 gsprintf(STDERR, " {common.general_options}:\n", $plugoutname);
266 }
267
268 # Display the download options
269 &PrintUsage::print_options_txt($plugoutargs, $optiondescoffset);
270 }
271
272 # Recurse up the download hierarchy
273 $self->print_plugout_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub error
279{
280 my ($strFunctionName,$strError) = @_;
281 {
282 print "Error occoured in BasePlugout.pm\n".
283 "In Function: ".$strFunctionName."\n".
284 "Error Message: ".$strError."\n";
285 exit(-1);
286 }
287}
288
289# OIDtype may be "hash" or "incremental" or "dirname" or "assigned"
290sub set_OIDtype {
291 my $self = shift (@_);
292 my ($type, $metadata) = @_;
293
294 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
295 $self->{'OIDtype'} = $type;
296 } else {
297 $self->{'OIDtype'} = "hash";
298 }
299 if ($type =~ /^assigned$/) {
300 if (defined $metadata) {
301 $self->{'OIDmetadata'} = $metadata;
302 } else {
303 $self->{'OIDmetadata'} = "dc.Identifier";
304 }
305 }
306}
307
308sub set_output_dir
309{
310 my $self = shift @_;
311 my ($output_dir) = @_;
312
313 $self->{'output_dir'} = $output_dir;
314}
315
316sub setoutputdir
317{
318 my $self = shift @_;
319 my ($output_dir) = @_;
320
321 $self->{'output_dir'} = $output_dir;
322}
323
324sub get_output_dir
325{
326 my $self = shift (@_);
327
328 return $self->{'output_dir'};
329}
330
331sub getoutputdir
332{
333 my $self = shift (@_);
334
335 return $self->{'output_dir'};
336}
337
338sub getoutputinfo
339{
340 my $self = shift (@_);
341
342 return $self->{'output_info'};
343}
344
345
346sub get_output_handler
347{
348 my $self = shift (@_);
349
350 my ($output_file_name) = @_;
351
352 open(*OUTPUT, ">$output_file_name") or die "Can not open a file handler for $output_file_name\n";
353
354 return *OUTPUT;
355}
356
357sub release_output_handler
358{
359 my $self = shift (@_);
360 my ($outhandler) = @_;
361
362 close($outhandler);
363
364}
365
366sub output_xml_header {
367 my $self = shift (@_);
368 my ($handle,$docroot,$nondoctype) = @_;
369
370 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
371
372 if (!defined $nondoctype){
373 my $doctype = (defined $docroot) ? $docroot : "Section";
374
375 # Used to be '<!DOCTYPE Archive SYSTEM ...'
376
377 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
378 }
379
380 print $handle "<$docroot>\n" if defined $docroot;
381}
382
383sub output_xml_footer {
384 my $self = shift (@_);
385 my ($handle,$docroot) = @_;
386 print $handle "</$docroot>\n" if defined $docroot;
387}
388
389sub process {
390 my $self = shift (@_);
391 my ($doc_obj) = @_;
392
393 $doc_obj->set_lastmodified();
394
395 if ($self->{'group_size'} > 1) {
396 $self->group_process ($doc_obj);
397 return;
398 }
399
400 my $OID = $doc_obj->get_OID();
401 $OID = "NULL" unless defined $OID;
402
403 my $top_section = $doc_obj->get_top_section();
404
405 #get document's directory
406 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
407
408 my $output_info = $self->{'output_info'};
409 return if (!defined $output_info);
410
411 ##############################
412 # call subclass' saveas method
413 ##############################
414 $self->saveas($doc_obj,$doc_dir);
415 $self->archiveinf_db($doc_obj,$doc_dir);
416
417}
418
419sub store_output_info_reference {
420 my $self = shift (@_);
421 my ($doc_obj) = @_;
422
423 my $output_info = $self->{'output_info'};
424 my $metaname = $self->{'sortmeta'};
425 if (!defined $metaname || $metaname !~ /\S/) {
426 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, "");
427 return;
428 }
429
430 my $metadata = "";
431 my $top_section = $doc_obj->get_top_section();
432
433 my @commameta_list = split(/,/, $metaname);
434 foreach my $cmn (@commameta_list) {
435 my $meta = $doc_obj->get_metadata_element($top_section, $cmn);
436 if ($meta) {
437 # do remove prefix/suffix - this will apply to all values
438 $meta =~ s/^$self->{'removeprefix'}// if defined $self->{'removeprefix'};
439 $meta =~ s/$self->{'removesuffix'}$// if defined $self->{'removesuffix'};
440 $meta = &sorttools::format_metadata_for_sorting($cmn, $meta, $doc_obj);
441 $metadata .= $meta if ($meta);
442 }
443 }
444
445 # store reference in the output_info
446 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
447
448}
449
450sub group_process {
451
452 my $self = shift (@_);
453 my ($doc_obj) = @_;
454
455 my $OID = $doc_obj->get_OID();
456 $OID = "NULL" unless defined $OID;
457
458 my $groupsize = $self->{'group_size'};
459 my $gs_count = $self->{'gs_count'};
460 my $open_new_file = (($gs_count % $groupsize)==0);
461 my $outhandle = $self->{'output_handle'};
462
463 # opening a new file, or document has assoicated files => directory needed
464 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
465
466 # The directory the archive file (doc.xml) and all associated files
467 # should end up in
468 my $doc_dir;
469 # If we've determined its time for a new file, open it now
470 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
471 {
472 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
473 # only if opening new file
474 my $output_dir = $self->get_output_dir();
475 &util::mk_all_dir ($output_dir) unless -e $output_dir;
476 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
477 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
478
479 if ($gs_count>0)
480 {
481 return if (!$self->close_file_output());
482 }
483
484 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
485
486
487 $self->{'gs_filename'} = $doc_file;
488 $self->{'short_doc_file'} = $short_doc_file;
489 $self->{'gs_OID'} = $OID;
490 $self->{'gs_doc_dir'} = $doc_dir;
491
492 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
493 }
494 # Otherwise load the same archive document directory used last time
495 else
496 {
497 $doc_dir = $self->{'gs_doc_dir'};
498 }
499
500 # copy all the associated files, add this information as metadata
501 # to the document
502 print $outhandle "Writing associated files to $doc_dir\n";
503 $self->process_assoc_files ($doc_obj, $doc_dir);
504
505 # look up 'gsdlmetafile' metadata and store that information
506 # explicitly in $doc_obj
507 $self->process_metafiles_metadata ($doc_obj);
508 }
509
510 # save this document
511 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
512 print GROUPPROCESS $section_text;
513
514 $self->{'gs_count'}++;
515}
516
517
518sub saveas {
519 my $self = shift (@_);
520
521 die "Basplug::saveas function must be implemented in sub classes\n";
522}
523
524sub get_doc_dir {
525 my $self = shift (@_);
526 my ($OID, $source_filename) = @_;
527
528 my $working_dir = $self->get_output_dir();
529 my $working_info = $self->{'output_info'};
530 return if (!defined $working_info);
531
532 my $doc_info = $working_info->get_info($OID);
533 my $doc_dir = '';
534
535 if (defined $doc_info && scalar(@$doc_info) >= 1)
536 {
537 # This OID already has an archives directory, so use it again
538 $doc_dir = $doc_info->[0];
539 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
540 }
541 elsif ($self->{'keep_import_structure'})
542 {
543 $source_filename = &File::Basename::dirname($source_filename);
544 $source_filename =~ s/[\\\/]+/\//g;
545 $source_filename =~ s/\/$//;
546
547 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
548 }
549
550 # We have to use a new archives directory for this document
551 if ($doc_dir eq "")
552 {
553 $doc_dir = $self->get_new_doc_dir ($working_info, $working_dir, $OID);
554 }
555
556 if (!defined $self->{'group'} || !$self->{'group'}){
557 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
558 }
559
560 return $doc_dir;
561}
562
563sub get_new_doc_dir{
564 my $self = shift (@_);
565 my($working_info,$working_dir,$OID) = @_;
566
567
568 my $doc_dir = "";
569 my $doc_dir_rest = $OID;
570
571 # remove any \ and / from the OID
572 $doc_dir_rest =~ s/[\\\/]//g;
573
574 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
575 $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
576
577 my $doc_dir_num = 0;
578
579 do {
580 $doc_dir .= "/" if $doc_dir_num > 0;
581 if ($doc_dir_rest =~ s/^(.{1,8})//) {
582 $doc_dir .= $1;
583 $doc_dir_num++;
584 }
585 } while ($doc_dir_rest ne "" &&
586 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
587 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
588 my $i = 1;
589 my $doc_dir_base = $doc_dir;
590 while (-d &util::filename_cat ($working_dir, "$doc_dir.dir")) {
591 $doc_dir = "$doc_dir_base-$i";
592 $i++;
593 }
594
595 return "$doc_dir.dir";
596}
597
598sub process_assoc_files {
599 my $self = shift (@_);
600 my ($doc_obj, $doc_dir, $handle) = @_;
601
602 my $outhandle = $self->{'output_handle'};
603
604 my $output_dir = $self->get_output_dir();
605 return if (!defined $output_dir);
606
607 &util::mk_all_dir ($output_dir) unless -e $output_dir;
608
609 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
610 &util::mk_all_dir ($working_dir) unless -e $working_dir;
611
612 my @assoc_files = ();
613 my $filename;;
614
615 my $source_filename = $doc_obj->get_source_filename();
616
617 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
618
619 if (defined $collect_dir) {
620 my $dirsep_regexp = &util::get_os_dirsep();
621
622 if ($collect_dir !~ /$dirsep_regexp$/) {
623 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
624 }
625
626 # This test is never going to fail on Windows -- is this a problem?
627
628 if ($source_filename !~ /^$dirsep_regexp/) {
629 $source_filename = &util::filename_cat($collect_dir, $source_filename);
630 }
631 }
632
633
634 # set the assocfile path (even if we have no assoc files - need this for lucene)
635 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
636 "assocfilepath",
637 "$doc_dir");
638 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
639 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
640 $dir = "" unless defined $dir;
641
642 my $utf8_real_filename = $assoc_file_rec->[0];
643
644 # for some reasons the image associate file has / before the full path
645 $utf8_real_filename =~ s/^\\(.*)/$1/i;
646
647## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
648 my $real_filename = $utf8_real_filename;
649 $real_filename = &util::downgrade_if_dos_filename($real_filename);
650
651 if (-e $real_filename) {
652
653 $filename = &util::filename_cat($working_dir, $afile);
654
655 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
656
657 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
658 "gsdlassocfile",
659 "$afile:$assoc_file_rec->[2]:$dir");
660 } elsif ($self->{'verbosity'} > 1) {
661 print $outhandle "BasePlugout::process couldn't copy the associated file " .
662 "$real_filename to $afile\n";
663 }
664 }
665}
666
667
668sub process_metafiles_metadata
669{
670 my $self = shift (@_);
671 my ($doc_obj) = @_;
672
673 my $top_section = $doc_obj->get_top_section();
674 my $metafiles = $doc_obj->get_metadata($top_section,"gsdlmetafile");
675
676 foreach my $metafile_pair (@$metafiles) {
677 my ($full_metafile,$metafile) = split(/ : /,$metafile_pair);
678
679 $doc_obj->metadata_file($full_metafile,$metafile);
680 }
681
682 $doc_obj->delete_metadata($top_section,"gsdlmetafile");
683}
684
685sub archiveinf_files_to_field
686{
687 my $self = shift(@_);
688 my ($files,$field,$collect_dir,$oid_files,$reverse_lookups) = @_;
689
690 foreach my $file_rec (@$files) {
691 my $real_filename = (ref $file_rec eq "ARRAY") ? $file_rec->[0] : $file_rec;
692 my $full_file = (ref $file_rec eq "ARRAY") ? $file_rec->[1] : $file_rec;
693 # for some reasons the image associate file has / before the full path
694 $real_filename =~ s/^\\(.*)/$1/i;
695
696 my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
697
698 if (-e $raw_filename) {
699
700# if (defined $collect_dir) {
701# my $collect_dir_re_safe = $collect_dir;
702# $collect_dir_re_safe =~ s/\\/\\\\/g;
703# $collect_dir_re_safe =~ s/\./\\./g;##
704
705# $real_filename =~ s/^$collect_dir_re_safe//;
706# }
707
708 if (defined $reverse_lookups) {
709 $reverse_lookups->{$real_filename} = 1;
710 }
711### push(@{$oid_files->{$field}},$full_file);
712 push(@{$oid_files->{$field}},$raw_filename);
713 }
714 else {
715 print STDERR "Warning: archiveinf_files_to_field()\n $real_filename does not appear to be on the file system\n";
716 }
717 }
718}
719
720sub archiveinf_db
721{
722 my $self = shift (@_);
723 my ($doc_obj) = @_;
724
725 my $verbosity = $self->{'verbosity'};
726
727 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
728 if (defined $collect_dir) {
729 my $dirsep_regexp = &util::get_os_dirsep();
730
731 if ($collect_dir !~ /$dirsep_regexp$/) {
732 # ensure there is a slash at the end
733 $collect_dir .= &util::get_dirsep();
734 }
735 }
736
737 my $oid = $doc_obj->get_OID();
738 my $source_filename = $doc_obj->get_unmodified_source_filename();
739 my $working_info = $self->{'output_info'};
740 my $doc_info = $working_info->get_info($oid);
741
742 my ($doc_file,$index_status,$sortmeta) = @$doc_info;
743 # doc_file is the path to the archive doc.xml. Make sure it has unix
744 # slashes, then if the collection is copied to linux, it can be built without reimport
745 $doc_file =~ s/\\/\//g;
746 my $oid_files = { 'doc-file' => $doc_file,
747 'index-status' => $index_status,
748 'src-file' => $source_filename,
749 'sort-meta' => $sortmeta,
750 'assoc-file' => [],
751 'meta-file' => [] };
752
753 my $reverse_lookups = { $source_filename => "1" };
754
755
756 $self->archiveinf_files_to_field($doc_obj->get_source_assoc_files(),"assoc-file",
757 $collect_dir,$oid_files,$reverse_lookups);
758
759
760 $self->archiveinf_files_to_field($doc_obj->get_meta_files(),"meta-file",
761 $collect_dir,$oid_files);
762
763 # Get the infodbtype value for this collection from the arcinfo object
764 my $infodbtype = $self->{'output_info'}->{'infodbtype'};
765 my $output_dir = $self->{'output_dir'};
766
767 my $doc_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
768 my $src_db = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $output_dir);
769
770 ##print STDERR "*** To set in db: \n\t$doc_db\n\t$oid\n\t$doc_db_text\n";
771
772 $oid_files->{'doc-file'} = [ $oid_files->{'doc-file'} ];
773 $oid_files->{'index-status'} = [ $oid_files->{'index-status'} ];
774 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
775 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
776
777 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
778 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, $oid_files);
779 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
780
781 foreach my $rl (keys %$reverse_lookups) {
782 $working_info->add_reverseinfo($rl,$oid);
783 }
784
785 # meta files not set in reverese entry, but need to set the metadata flag
786 if (defined $doc_obj->get_meta_files()) {
787 foreach my $meta_file_rec(@{$doc_obj->get_meta_files()}) {
788 my $full_file = (ref $meta_file_rec eq "ARRAY") ? $meta_file_rec->[0] : $meta_file_rec;
789 $working_info->set_meta_file_flag($full_file);
790 }
791 }
792}
793
794
795sub set_sortmeta {
796 my $self = shift (@_);
797 my ($sortmeta, $removeprefix, $removesuffix) = @_;
798
799 $self->{'sortmeta'} = $sortmeta;
800 if (defined ($removeprefix) && $removeprefix ) {
801 $removeprefix =~ s/^\^//; # don't need a leading ^
802 $self->{'removeprefix'} = $removeprefix;
803 }
804 if (defined ($removesuffix) && $removesuffix) {
805 $removesuffix =~ s/\$$//; # don't need a trailing $
806 $self->{'removesuffix'} = $removesuffix;
807 }
808}
809
810sub open_xslt_pipe
811{
812 my $self = shift @_;
813 my ($output_file_name, $xslt_file)=@_;
814
815 return unless defined $xslt_file and $xslt_file ne "" and -e $xslt_file;
816
817 my $java_class_path = &util::filename_cat ($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
818
819 my $mapping_file_path = "";
820
821 if ($ENV{'GSDLOS'} eq "windows"){
822 $java_class_path .=";".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
823 # this file:/// bit didn't work for me on windows XP
824 #$xslt_file = "\"file:///".$xslt_file."\"";
825 #$mapping_file_path = "\"file:///";
826 }
827 else{
828 $java_class_path .=":".&util::filename_cat ($ENV{'GSDLHOME'},"bin","java","xalan.jar");
829 }
830
831
832 $java_class_path = "\"".$java_class_path."\"";
833
834 my $cmd = "| java -cp $java_class_path org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
835
836 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
837 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
838 $cmd .= "-m $mapping_file_path";
839 }
840
841 open(*XMLWRITER, $cmd)
842 or die "can't open pipe to xslt: $!";
843
844
845 $self->{'xslt_writer'} = *XMLWRITER;
846
847 print XMLWRITER "<?DocStart?>\n";
848 print XMLWRITER "$output_file_name\n";
849
850
851 }
852
853
854sub close_xslt_pipe
855{
856 my $self = shift @_;
857
858
859 return unless defined $self->{'xslt_writer'} ;
860
861 my $xsltwriter = $self->{'xslt_writer'};
862
863 print $xsltwriter "<?DocEnd?>\n";
864 close($xsltwriter);
865
866 undef $self->{'xslt_writer'};
867
868}
869
870sub close_file_output
871{
872 my ($self) = @_;
873
874 # make sure that the handle has been opened - it won't be if we failed
875 # to import any documents...
876 if (defined(fileno(GROUPPROCESS))) {
877 $self->output_xml_footer('GROUPPROCESS','Archive');
878 close GROUPPROCESS;
879 }
880
881 my $OID = $self->{'gs_OID'};
882 my $short_doc_file = $self->{'short_doc_file'};
883
884 if ($self->{'gzip'}) {
885 my $doc_file = $self->{'gs_filename'};
886 `gzip $doc_file`;
887 $doc_file .= ".gz";
888 $short_doc_file .= ".gz";
889 if (!-e $doc_file) {
890 my $outhandle = $self->{'output_handle'};
891 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
892 return 0;
893 }
894 }
895
896 # store reference in output_info
897 my $output_info = $self->{'output_info'};
898 return 0 if (!defined $output_info);
899 $output_info->add_info($OID, $short_doc_file, undef, undef);
900 return 1;
901}
902
903
904#the subclass should implement this method if is_group method could return 1.
905sub close_group_output{
906 my $self = shift (@_);
907}
908
909sub is_group {
910 my $self = shift (@_);
911 return 0;
912}
913
914my $dc_set = { Title => 1,
915 Creator => 1,
916 Subject => 1,
917 Description => 1,
918 Publisher => 1,
919 Contributor => 1,
920 Date => 1,
921 Type => 1,
922 Format => 1,
923 Identifier => 1,
924 Source => 1,
925 Language => 1,
926 Relation => 1,
927 Coverage => 1,
928 Rights => 1};
929
930
931# returns an XML representation of the dublin core metadata
932# if dc meta is not found, try ex mete
933sub get_dc_metadata {
934 my $self = shift(@_);
935 my ($doc_obj, $section, $version) = @_;
936
937 # build up string of dublin core metadata
938 $section="" unless defined $section;
939
940 my $section_ptr = $doc_obj->_lookup_section($section);
941 return "" unless defined $section_ptr;
942
943
944 my $explicit_dc = {};
945 my $explicit_ex = {};
946
947 my $all_text="";
948 foreach my $data (@{$section_ptr->{'metadata'}}){
949 my $escaped_value = &docprint::escape_text($data->[1]);
950 if ($data->[0]=~ m/^dc\./) {
951 $data->[0] =~ tr/[A-Z]/[a-z]/;
952
953 $data->[0] =~ m/^dc\.(.*)/;
954 my $dc_element = $1;
955
956 if (!defined $explicit_dc->{$dc_element}) {
957 $explicit_dc->{$dc_element} = [];
958 }
959 push(@{$explicit_dc->{$dc_element}},$escaped_value);
960
961 if (defined $version && ($version eq "oai_dc")) {
962 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
963 }
964 else {
965 # qualifier???
966 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
967 }
968
969 }
970 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
971 $data->[0] =~ m/^(ex\.)?(.*)/;
972 my $ex_element = $2;
973 my $lc_ex_element = lc($ex_element);
974
975 if (defined $dc_set->{$ex_element}) {
976 if (!defined $explicit_ex->{$lc_ex_element}) {
977 $explicit_ex->{$lc_ex_element} = [];
978 }
979 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
980 }
981 }
982 }
983
984 # go through dc_set and for any element *not* defined in explicit_dc
985 # that does exist in explicit_ex, add it in as metadata
986 foreach my $k ( keys %$dc_set ) {
987 my $lc_k = lc($k);
988
989 if (!defined $explicit_dc->{$lc_k}) {
990 if (defined $explicit_ex->{$lc_k}) {
991
992 foreach my $v (@{$explicit_ex->{$lc_k}}) {
993 my $dc_element = $lc_k;
994 my $escaped_value = $v;
995
996 if (defined $version && ($version eq "oai_dc")) {
997 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
998 }
999 else {
1000 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
1001 }
1002
1003 }
1004 }
1005 }
1006 }
1007
1008 if ($all_text eq "") {
1009 $all_text .= " There is no Dublin Core metatdata in this document\n";
1010 }
1011 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
1012
1013 return $all_text;
1014}
1015
1016# Build up dublin_core metadata. Priority given to dc.* over ex.*
1017# This method was apparently added by Jeffrey and committed by Shaoqun.
1018# But we don't know why it was added, so not using it anymore.
1019sub new_get_dc_metadata {
1020
1021 my $self = shift(@_);
1022 my ($doc_obj, $section, $version) = @_;
1023
1024 # build up string of dublin core metadata
1025 $section="" unless defined $section;
1026
1027 my $section_ptr=$doc_obj->_lookup_section($section);
1028 return "" unless defined $section_ptr;
1029
1030 my $all_text = "";
1031 foreach my $data (@{$section_ptr->{'metadata'}}){
1032 my $escaped_value = &docprint::escape_text($data->[1]);
1033 my $dc_element = $data->[0];
1034
1035 my @array = split('\.',$dc_element);
1036 my ($type,$name);
1037
1038 if(defined $array[1])
1039 {
1040 $type = $array[0];
1041 $name = $array[1];
1042 }
1043 else
1044 {
1045 $type = "ex";
1046 $name = $array[0];
1047 }
1048
1049 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
1050 }
1051 return $all_text;
1052}
1053
1054
10551;
Note: See TracBrowser for help on using the repository browser.