source: trunk/gsdl/perllib/docsave.pm@ 10168

Last change on this file since 10168 was 10154, checked in by davidb, 19 years ago

Elimination of expinfo package, and its use in docsave.

Reasoning: expinfo package was added in when export.pl was devised. expinfo i
in fact is no different to the original arcinfo (that it was based on).
Having the two classes used in docsave therefore clutters things somewhat
with no real benefit. docsave code collapes back to use only arcinfo.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use docproc;
38use util;
39
40
41sub BEGIN {
42 @docsave::ISA = ('docproc');
43}
44
45
46sub calcoutputdir
47{
48 my ($service,$collectdir,$outhandle) = @_;
49
50 my $output_dir = undef;
51
52 if (($service eq "import") || ($service eq "unbuild")) {
53 $output_dir = &util::filename_cat ($collectdir, "archives");
54 }
55 elsif ($service eq "export") {
56 $output_dir = &util::filename_cat($collectdir, "export");
57 }
58 else {
59 print $outhandle "docsave::calcoutputdir did not recognise service ";
60 print $outhandle " '$service'. No output directory set.\n";
61 }
62
63 return $output_dir;
64}
65
66
67sub new {
68 my ($class, $collection, $info, $verbosity,
69 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
70 my $self = new docproc ();
71
72 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
73
74 $outhandle = 'STDERR' unless (defined $outhandle);
75 $service = "import" unless (defined $service);
76 $saveas = "GA" unless (defined $saveas);
77 $groupsize = 1 unless (defined $groupsize);
78
79 $self->{'collection'} = $collection;
80
81 $self->{'output_info'} = $info;
82 $self->{'output_dir'} = undef;
83
84 $self->{'verbosity'} = $verbosity;
85 $self->{'gzip'} = $gzip;
86 $self->{'keepimportstructure'} = 0;
87 $self->{'groupsize'} = $groupsize;
88 $self->{'gs_count'} = 0;
89
90 $self->{'outhandle'} = $outhandle;
91 $self->{'service'} = $service;
92 $self->{'saveas'} = $saveas;
93
94 $self->{'sortmeta'} = undef;
95
96 return bless $self, $class;
97}
98
99
100
101sub setoutputdir
102{
103 my $self = shift @_;
104 my ($output_dir) = @_;
105
106 &util::mk_all_dir ($output_dir) unless -e $output_dir;
107
108 $self->{'output_dir'} = $output_dir;
109}
110
111sub getoutputinfo
112{
113 my $self = shift (@_);
114
115 return $self->{'output_info'};
116}
117
118sub getoutputdir
119{
120 my $self = shift (@_);
121
122 return $self->{'output_dir'};
123}
124
125
126sub set_sortmeta {
127 my $self = shift (@_);
128 my ($sortmeta, $removeprefix, $removesuffix) = @_;
129
130 $self->{'sortmeta'} = $sortmeta;
131 if (defined ($removeprefix) && $removeprefix ) {
132 $removeprefix =~ s/^\^//; # don't need a leading ^
133 $self->{'removeprefix'} = $removeprefix;
134 }
135 if (defined ($removesuffix) && $removesuffix) {
136 $removesuffix =~ s/\$$//; # don't need a trailing $
137 $self->{'removesuffix'} = $removesuffix;
138 }
139}
140
141sub process {
142 my $self = shift (@_);
143 my ($doc_obj) = @_;
144
145 my $outhandle = $self->{'outhandle'};
146 my $service = $self->{'service'};
147
148 # Define the SaveAs Type
149 my $save_as = $self->{'saveas'};
150 my $collection = $self->{'collection'};
151
152 if ($self->{'groupsize'} > 1) {
153 $self->group_process ($doc_obj);
154 return;
155 }
156
157 my $OID = $doc_obj->get_OID();
158 $OID = "NULL" unless defined $OID;
159
160 my $top_section = $doc_obj->get_top_section();
161
162 # get document's directory
163 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
164
165 # groupsize is 1 (i.e. one document per XML file) so sortmeta
166 # may be used
167
168 my $output_info = $self->getoutputinfo();
169 return if (!defined $output_info);
170
171 my $output_dir = $self->getoutputdir();
172 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
173
174 # copy all the associated files, add this information as metadata
175 # to the document
176 if ($save_as eq "DSpace") {
177
178 # Genereate handle file
179 # (Note: this section of code would benefit from being restructured)
180 my $doc_handle_file = &util::filename_cat ($working_dir, "handle");
181
182 my $env_hp = $ENV{'DSPACE_HANDLE_PREFIX'};
183 my $handle_prefix = (defined $env_hp) ? $env_hp : "123456789";
184
185 if (!open(OUTDOC_HANDLE,">$doc_handle_file")){
186 print $outhandle "docsave::process could not write collection handle to file $doc_handle_file\n";
187 return;
188 }
189
190 my ($handle) = ($doc_dir =~ m/^(.*)\.dir$/);
191 print OUTDOC_HANDLE "$handle_prefix/$handle\n";
192
193 close OUTDOC_HANDLE;
194
195 # Generate contents file
196 my $doc_contents_file = &util::filename_cat ($working_dir, "contents");
197
198 if (!open(OUTDOC_CONTENTS,">$doc_contents_file")){
199 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
200 return;
201 }
202 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_CONTENTS');
203
204 close OUTDOC_CONTENTS;
205
206 } else {
207 $self->process_assoc_files ($doc_obj, $doc_dir, '');
208 }
209
210 # Save the document in the requested 'save_as' format
211
212 if ($save_as eq "GA") {
213
214 my $doc_file = &util::filename_cat ($working_dir, "doc.xml");
215
216 if (!open (OUTDOC, ">$doc_file")) {
217 print $outhandle "docsave::process could not write to file $doc_file\n";
218 return;
219 }
220
221 # save this document
222 $self->output_xml_header('docsave::OUTDOC');
223 $doc_obj->output_section('docsave::OUTDOC',$top_section);
224 $self->output_xml_footer('docsave::OUTDOC');
225
226 close OUTDOC;
227 }
228 elsif ($save_as eq "METS") {
229
230 my $doc_txt_file = &util::filename_cat ($working_dir,"doctxt.xml");
231
232 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
233 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
234 return;
235 }
236
237 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
238 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $top_section);
239 $self->output_txt_xml_footer('docsave::OUTDOC_TXT');
240
241 close OUTDOC_TXT;
242
243 # Now save the document with metadata and text structure to docmets.xml
244
245 my $doc_mets_file = &util::filename_cat ($working_dir, "docmets.xml");
246
247 my $doc_title = $doc_obj->get_metadata_element($top_section,"dc.Title");
248 if (!defined $doc_title) {
249 $doc_title = $doc_obj->get_metadata_element($top_section,"Title");
250 }
251
252 if (!open(OUTDOC_METS,">$doc_mets_file")){
253 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
254 return;
255 }
256
257 my $saveas_version = $self->{'saveas_version'};
258 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID, $doc_title);
259 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$top_section,$saveas_version,$working_dir);
260 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
261
262 close OUTDOC_METS;
263 }
264 elsif ($save_as eq "DSpace") {
265
266 # Generate dublin_core.xml file
267 my $doc_dc_file = &util::filename_cat ($working_dir, "dublin_core.xml");
268
269 if (!open(OUTDOC_DC,">$doc_dc_file")){
270 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
271 return;
272 }
273
274 my $saveas_version = $self->{'saveas_version'};
275
276 $self->output_dc_xml_header('docsave::OUTDOC_DC', $OID);
277 $doc_obj->output_dc_section('docsave::OUTDOC_DC',$top_section);
278 $self->output_dc_xml_footer('docsave::OUTDOC_DC');
279
280 close OUTDOC_DC;
281 } else { # save_as isn't one of the recognised types
282 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
283 return;
284 }
285
286
287 my $short_doc_file;
288
289 if ($save_as eq "GA") {
290 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
291 } elsif ($save_as eq "METS") {
292 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
293 } elsif ($save_as eq "DSpace") {
294 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
295 } else {
296 return;
297 }
298
299 #save for later (for close_file_output())
300 $self->{'short_doc_file'} = $short_doc_file;
301
302 if ($self->{'gzip'}) {
303 my $doc_file = $self->{'gs_filename'};
304 `gzip $doc_file`;
305 $doc_file .= ".gz";
306 $short_doc_file .= ".gz";
307 if (!-e $doc_file) {
308 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
309 return 0;
310 }
311 }
312
313 # do the sortmeta thing
314 my ($metadata);
315 if (defined ($self->{'sortmeta'})) {
316 $metadata = $doc_obj->get_metadata_element($top_section,$self->{'sortmeta'});
317 }
318 if (defined ($metadata) && $metadata) {
319 # do remove prefix/suffix
320 if (defined($self->{'removeprefix'})) {
321 $metadata =~ s/^$self->{'removeprefix'}//;
322 }
323 if (defined($self->{'removesuffix'})) {
324 $metadata =~ s/$self->{'removesuffix'}$//;
325 }
326 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
327 }
328
329 # store reference in the output_info
330 $output_info->add_info($OID, $short_doc_file, undef, $metadata);
331}
332
333
334sub group_process {
335 my $self = shift (@_);
336 my ($doc_obj) = @_;
337
338 my $outhandle = $self->{'outhandle'};
339
340 my $OID = $doc_obj->get_OID();
341 $OID = "NULL" unless defined $OID;
342
343 my $groupsize = $self->{'groupsize'};
344 my $gs_count = $self->{'gs_count'};
345 my $open_new_file = (($gs_count % $groupsize)==0);
346
347 # opening a new file, or document has assoicated files => directory needed
348 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
349
350 # get document's directory
351 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
352
353 # copy all the associated files, add this information as metadata
354 # to the document
355 $self->process_assoc_files ($doc_obj, $doc_dir);
356
357
358 if ($open_new_file) {
359 # only if opening new file
360 my $output_dir = $self->getoutputdir();
361 my $doc_file
362 = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
363 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
364
365 if ($gs_count>0)
366 {
367 return if (!$self->close_file_output());
368 }
369
370 if (!open (OUTDOC, ">$doc_file")) {
371 print $outhandle "docsave::group_process could not write to file $doc_file\n";
372 return;
373 }
374 $self->{'gs_filename'} = $doc_file;
375 $self->{'gs_short_filename'} = $short_doc_file;
376 $self->{'gs_OID'} = $OID;
377
378 $self->output_xml_header('docsave::OUTDOC');
379 }
380 }
381
382 # save this document
383 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
384
385 $self->{'gs_count'}++;
386}
387
388sub get_doc_dir {
389 my $self = shift (@_);
390 my ($OID, $source_filename) = @_;
391
392 my $service = $self-> {'service'};
393
394 my $working_dir = $self->getoutputdir();
395 my $working_info = $self->getoutputinfo();
396 return if (!defined $working_info);
397
398 my $doc_info = $working_info->get_info($OID);
399 my $doc_dir = '';
400
401 if (defined $doc_info && scalar(@$doc_info) >= 1) {
402 # this OID already has an assigned directory, use the
403 # same one.
404 $doc_dir = $doc_info->[0];
405 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
406 } elsif ($self->{'keepimportstructure'}) {
407 $source_filename = &File::Basename::dirname($source_filename);
408 $source_filename =~ s/[\\\/]+/\//g;
409 $source_filename =~ s/\/$//;
410
411
412 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
413 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
414
415 }
416 if ($doc_dir eq "") {
417 # have to get a new document directory
418
419 if (($service eq "import") || ($service eq "unbuild")) {
420 my $doc_dir_rest = $OID;
421 my $doc_dir_num = 0;
422
423 do {
424 $doc_dir .= "/" if $doc_dir_num > 0;
425 if ($doc_dir_rest =~ s/^(.{1,8})//) {
426 $doc_dir .= $1;
427 $doc_dir_num++;
428 }
429 } while ($doc_dir_rest ne "" &&
430 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
431 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
432 }
433 else {
434 # Export formats such as DSpace need the directory structure to
435 # be flat. This is simple to arrange (set 'doc_dir' to be the
436 # documents OID) but breaks Windows 3.1 file system compliance.
437 # Such a loss is not a big thing in this situation as such
438 # systems don't run on Windows 3.1 anyway.
439
440 $doc_dir = $OID;
441 }
442
443
444 $doc_dir .= ".dir";
445 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
446 }
447 return $doc_dir;
448}
449
450sub process_assoc_files {
451 my $self = shift (@_);
452 my ($doc_obj, $doc_dir, $handle) = @_;
453
454 my $outhandle = $self->{'outhandle'};
455 my $service = $self->{'service'};
456 my $save_as = $self->{'saveas'};
457
458 my $output_dir = $self->getoutputdir();
459 return if (!defined $output_dir);
460
461 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
462
463 my @assoc_files = ();
464 my $filename;;
465
466 my $source_filename = $doc_obj->get_source_filename();
467
468 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
469
470 if (defined $collect_dir) {
471 my $dirsep_regexp = &util::get_os_dirsep();
472
473 if ($collect_dir !~ /$dirsep_regexp$/) {
474 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
475 }
476
477 # This test is never going to fail on Windows -- is this a problem?
478 if ($source_filename !~ /^$dirsep_regexp/) {
479 $source_filename = &util::filename_cat($collect_dir, $source_filename);
480 }
481 }
482
483
484 if ($save_as eq "DSpace") {
485 my ($tail_filename) = ($source_filename =~ m/\/([^\/\\]*)$/);
486
487 print $handle "$tail_filename\n";
488
489 $filename = &util::filename_cat($working_dir, $tail_filename);
490 &util::hard_link ($source_filename, $filename);
491 }
492
493 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
494 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
495 $dir = "" unless defined $dir;
496
497
498 my $real_filename = $assoc_file_rec->[0];
499 if (-e $real_filename) {
500
501
502 if ($save_as eq "DSpace") {
503 if ($real_filename =~ m/$source_filename$/) {
504 next;
505 }
506 else {
507 my $bundle = "bundle:ORIGINAL";
508
509 if ($afile =~ m/^thumbnail\./) {
510 $bundle = "bundle:THUMBNAIL";
511 }
512
513 # Store the associated file to the "contents" file
514 print $handle "$assoc_file_rec->[1]\t$bundle\n";
515 }
516 }
517
518 $filename = &util::filename_cat($working_dir, $afile);
519
520
521 &util::hard_link ($real_filename, $filename);
522
523 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
524 "gsdlassocfile",
525 "$afile:$assoc_file_rec->[2]:$dir");
526 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
527 "assocfilepath",
528 "$doc_dir");
529 } elsif ($self->{'verbosity'} > 2) {
530 print $outhandle "docsave::process couldn't copy the associated file " .
531 "$real_filename to $afile\n";
532 }
533 }
534}
535
536
537sub close_file_output
538{
539 my ($self) = @_;
540 my $service =$self->{'service'};
541
542 # make sure that the handle has been opened - it won't be if we failed
543 # to import any documents...
544 if (defined(fileno(docsave::OUTDOC))) {
545 $self->output_xml_footer('docsave::OUTDOC');
546 close OUTDOC;
547 }
548
549 my $OID = $self->{'gs_OID'};
550 my $short_doc_file;
551 # can we use 'short_doc_file' for GA too?
552 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
553 $short_doc_file=$self->{'short_doc_file'};
554 } elsif ($self->{'saveas'} eq "GA") { # "GA"
555 $short_doc_file=$self->{'gs_short_filename'};
556 } else { # "DSpace"
557 }
558
559 if ($self->{'gzip'}) {
560 my $doc_file = $self->{'gs_filename'};
561 `gzip $doc_file`;
562 $doc_file .= ".gz";
563 $short_doc_file .= ".gz";
564 if (!-e $doc_file) {
565 my $outhandle = $self->{'outhandle'};
566 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
567 return 0;
568 }
569 }
570
571 # store reference in output_info
572 my $output_info = $self->getoutputinfo();
573 return 0 if (!defined $output_info);
574 $output_info->add_info($OID, $short_doc_file, undef, undef);
575
576 return 1;
577}
578
579sub output_xml_header {
580 my $self = shift (@_);
581 my ($handle) = @_;
582
583 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
584
585 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
586 print $handle "<Archive>\n";
587}
588
589sub output_xml_footer {
590 my $self = shift (@_);
591 my ($handle) = @_;
592
593 print $handle "</Archive>\n";
594}
595
596sub output_txt_xml_header{
597 my $self = shift (@_);
598 my ($handle) = @_;
599 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
600 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
601}
602
603sub output_txt_xml_footer{
604 my $self = shift(@_);
605 my ($handle) = @_;
606 # Nothing needs to be output at present
607}
608
609sub output_mets_xml_header(){
610 my $self = shift(@_);
611 my ($handle, $OID, $doc_title) = @_;
612
613 my $version = $self->{'saveas_version'};
614
615 my $extra_attr = "";
616 if ($version eq "fedora") {
617 my $fnamespace = $ENV{'FEDORA_PID_NAMESPACE'};
618 my $oid_namespace = (defined $fnamespace) ? $fnamespace : "test";
619
620 $extra_attr = "OBJID=\"$oid_namespace:$OID\" TYPE=\"FedoraObject\" LABEL=\"$doc_title\"";
621 }
622 else {
623 # Greenstone METS profile
624 $extra_attr = "OBJID=\"$OID:2\"";
625 }
626
627
628 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
629 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
630 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
631 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
632 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
633 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
634 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
635 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
636 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
637 print $handle " $extra_attr>\n";
638
639 if ($version eq "fedora") {
640 print $handle '<mets:metsHdr RECORDSTATUS="A"/>'. "\n"; # A = active
641 }
642
643}
644
645sub output_mets_xml_footer() {
646 my $self = shift(@_);
647 my ($handle) = @_;
648 print $handle '</mets:mets>' . "\n";
649}
650
651sub output_dc_xml_header(){
652 my $self = shift(@_);
653 my ($handle, $OID) = @_;
654
655 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
656# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
657 print $handle '<dublin_core>' . "\n";
658}
659
660sub output_dc_xml_footer() {
661 my $self = shift(@_);
662 my ($handle) = @_;
663 print $handle '</dublin_core>' . "\n";
664}
6651;
Note: See TracBrowser for help on using the repository browser.