source: trunk/gsdl/perllib/docsave.pm@ 9620

Last change on this file since 9620 was 9231, checked in by davidb, 19 years ago

Revision to associated files for handling METS saveas case.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use expinfo;
38use docproc;
39use util;
40
41
42sub BEGIN {
43 @docsave::ISA = ('docproc');
44}
45
46sub new {
47 my ($class, $collection, $info, $verbosity,
48 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
49 my $self = new docproc ();
50
51 $groupsize=1 unless defined $groupsize;
52 $service="import" unless defined $service;
53
54 $self->{'collection'} = $collection;
55 if ($service eq "import"){
56 $self->{'archive_info'} = $info;
57 } elsif ($service eq "export"){
58 $self->{'export_info'} = $info;
59 } else {
60 return;
61 }
62
63 $self->{'verbosity'} = $verbosity;
64 $self->{'gzip'} = $gzip;
65 $self->{'keepimportstructure'} = 0;
66 $self->{'groupsize'} = $groupsize;
67 $self->{'gs_count'} = 0;
68
69 $self->{'outhandle'} = 'STDERR';
70 $self->{'outhandle'} = $outhandle if defined $outhandle;
71 $self->{'service'} = $service;
72 $self->{'saveas'} = $saveas;
73
74 # set a default for the archive directory
75 if ($service eq "import"){
76 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
77 } elsif ($service eq "export") {
78 # set a default for the export directory
79 $self->{'export_dir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");
80 } else {
81 return;
82 }
83 $self->{'sortmeta'} = undef;
84
85 return bless $self, $class;
86}
87
88sub setarchivedir {
89 my $self = shift (@_);
90 my ($archive_dir) = @_;
91
92 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
93 $self->{'archive_dir'} = $archive_dir;
94}
95
96sub setexportdir {
97 my $self = shift (@_);
98 my ($export_dir) = @_;
99
100 &util::mk_all_dir ($export_dir) unless -e $export_dir;
101 $self->{'export_dir'} = $export_dir;
102}
103
104sub set_sortmeta {
105 my $self = shift (@_);
106 my ($sortmeta, $removeprefix, $removesuffix) = @_;
107
108 $self->{'sortmeta'} = $sortmeta;
109 if (defined ($removeprefix) && $removeprefix ) {
110 $removeprefix =~ s/^\^//; # don't need a leading ^
111 $self->{'removeprefix'} = $removeprefix;
112 }
113 if (defined ($removesuffix) && $removesuffix) {
114 $removesuffix =~ s/\$$//; # don't need a trailing $
115 $self->{'removesuffix'} = $removesuffix;
116 }
117}
118
119sub process {
120 my $self = shift (@_);
121 my ($doc_obj) = @_;
122
123 my $outhandle = $self->{'outhandle'};
124 my $service = $self->{'service'} || "import";
125
126 # Define the SaveAs Type
127 my $save_as = $self->{'saveas'} || "GA";
128 my $collection = $self->{'collection'};
129
130 if ($self->{'groupsize'} > 1) {
131 $self->group_process ($doc_obj);
132 return;
133 }
134
135 my $OID = $doc_obj->get_OID();
136 $OID = "NULL" unless defined $OID;
137
138 # get document's directory
139 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
140
141 # groupsize is 1 (i.e. one document per XML file) so sortmeta
142 # may be used
143
144 if ($service eq "import") {
145 my $archive_info = $self->{'archive_info'};
146 } elsif ($service eq "export") {
147 my $export_info = $self->{'export_info'};
148 } else {
149 return;
150 }
151
152 # copy all the associated files, add this information as metadata
153 # to the document
154 if ($service eq "export" && $save_as eq "DSpace") {
155 # open contents file
156 my $doc_contents_file
157 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "contents");
158
159 if (!open(OUTDOC_EXPORT_CONTENTS,">$doc_contents_file")){
160 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
161 return;
162 }
163 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_EXPORT_CONTENTS');
164 } else {
165 $self->process_assoc_files ($doc_obj, $doc_dir, '');
166 }
167
168 my $doc_file;
169 my $doc_mets_file;
170 my $doc_txt_file;
171 my $short_doc_file;
172
173 #Import collection to GS2 in GS Archive format and METs format
174 if ($service eq "import") {
175 my $doc_file
176 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
177
178 #***define doctxt.xml file
179 my $doc_txt_file
180 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
181
182 my $import_working_dir
183 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
184
185 #***define docmets.xml file
186 my $doc_mets_file
187 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
188
189 if ($save_as eq "GA") {
190 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
191 } elsif ($save_as eq "METS") {
192 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
193 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
194 } else {
195 return;
196 }
197
198 if ($save_as eq "GA") {
199 if (!open (OUTDOC, ">$doc_file")) {
200 print $outhandle "docsave::process could not write to file $doc_file\n";
201 return;
202 }
203 # save this document
204 $self->output_xml_header('docsave::OUTDOC');
205 $doc_obj->output_section('docsave::OUTDOC',
206 $doc_obj->get_top_section());
207 $self->output_xml_footer('docsave::OUTDOC');
208
209 close OUTDOC;
210 } elsif ($save_as eq "METS") {
211 # save the document without metadata:doctxt.xml
212
213 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
214 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
215 return;
216 }
217
218 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
219 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
220 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
221
222 # Convert doctxt.xml file to docmets.xml
223 if (!open(OUTDOC_METS,">$doc_mets_file")){
224 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
225 return;
226 }
227
228 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
229 $doc_obj->output_mets_section('docsave::OUTDOC_METS',
230 $doc_obj->get_top_section());
231 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
232
233 close OUTDOC_TXT;
234 close OUTDOC_METS;
235 } else { # save_as isn't GA or METS
236 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
237 return;
238 }
239 }
240
241 ## Export the collection to METs format or DSpace Archive Format into the export directory
242 if ($service eq "export") {
243 my $doc_dc_file;
244 my $doc_contents_file;
245
246 my $export_working_dir
247 =&util::filename_cat ($self->{'export_dir'}, $doc_dir);
248
249 if ($save_as eq "METS") {
250 $doc_mets_file
251 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "docmets.xml");
252
253 $doc_txt_file
254 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "doctxt.xml");
255
256 if (!open(OUTDOC_EXPORT_TXT, ">$doc_txt_file")){
257 print $outhandle "docsave::process could not write TXT to file $doc_txt_file\n";
258 return;
259 }
260
261 $self->output_txt_xml_header('docsave::OUTDOC_EXPORT_TXT');
262 $doc_obj->output_txt_section('docsave::OUTDOC_EXPORT_TXT', $doc_obj->get_top_section());
263
264 if (!open(OUTDOC_EXPORT_METS,">$doc_mets_file")){
265 print $outhandle "docsave::process could not write METS format to file $doc_mets_file\n";
266 return;
267 }
268 $self->output_mets_xml_header('docsave::OUTDOC_EXPORT_METS', $OID);
269 $doc_obj->output_mets_section('docsave::OUTDOC_EXPORT_METS',$doc_obj->get_top_section(), $export_working_dir);
270 $self->output_mets_xml_footer('docsave::OUTDOC_EXPORT_METS');
271
272 close OUTDOC_EXPORT_TXT;
273 close OUTDOC_EXPORT_METS;
274 } elsif ($save_as eq "DSpace") {
275
276 # Generate dublin_core.xml file
277 $doc_dc_file
278 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "dublin_core.xml");
279
280 if (!open(OUTDOC_EXPORT_DC,">$doc_dc_file")){
281 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
282 return;
283 }
284
285 $self->output_dc_xml_header('docsave::OUTDOC_EXPORT_DC', $OID);
286 $doc_obj->output_dc_section('docsave::OUTDOC_EXPORT_DC',$doc_obj->get_top_section(), $export_working_dir);
287 $self->output_dc_xml_footer('docsave::OUTDOC_EXPORT_DC');
288
289 close OUTDOC_EXPORT_DC;
290 close OUTDOC_EXPORT_CONTENTS;
291 } else { # save_as isn't METS or DSpace
292 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
293 return;
294 }
295
296 if ($save_as eq "METS") {
297 $short_doc_file = util::filename_cat ($doc_dir, "docmets.xml");
298 } elsif ($save_as eq "DSpace") {
299 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
300 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
301 } else {
302 return;
303 }
304
305 }
306 #save for later (for close_file_output())
307 $self->{'short_doc_file'} = $short_doc_file;
308
309 if ($self->{'gzip'}) {
310 my $doc_file = $self->{'gs_filename'};
311 `gzip $doc_file`;
312 $doc_file .= ".gz";
313 $short_doc_file .= ".gz";
314 if (!-e $doc_file) {
315 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
316 return 0;
317 }
318 }
319
320 # do the sortmeta thing
321 my ($metadata);
322 if (defined ($self->{'sortmeta'})) {
323 $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
324 $self->{'sortmeta'});
325 }
326 if (defined ($metadata) && $metadata) {
327 # do remove prefix/suffix
328 if (defined($self->{'removeprefix'})) {
329 $metadata =~ s/^$self->{'removeprefix'}//;
330 }
331 if (defined($self->{'removesuffix'})) {
332 $metadata =~ s/$self->{'removesuffix'}$//;
333 }
334 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
335 }
336 # store reference in the archive_info and export_info
337 if ($service eq "export") {
338 $self->{'export_info'}->add_info($OID, $short_doc_file, $metadata);
339 } elsif ($service eq "import") {
340 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
341 }
342}
343
344
345sub group_process {
346 my $self = shift (@_);
347 my ($doc_obj) = @_;
348
349 my $outhandle = $self->{'outhandle'};
350
351 my $OID = $doc_obj->get_OID();
352 $OID = "NULL" unless defined $OID;
353
354 my $groupsize = $self->{'groupsize'};
355 my $gs_count = $self->{'gs_count'};
356 my $open_new_file = (($gs_count % $groupsize)==0);
357
358 # opening a new file, or document has assoicated files => directory needed
359 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
360
361 # get document's directory
362 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
363
364 # copy all the associated files, add this information as metadata
365 # to the document
366 $self->process_assoc_files ($doc_obj, $doc_dir);
367
368
369 if ($open_new_file) {
370 # only if opening new file
371 my $doc_file
372 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
373 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
374
375 if ($gs_count>0)
376 {
377 return if (!$self->close_file_output());
378 }
379
380 if (!open (OUTDOC, ">$doc_file")) {
381 print $outhandle "docsave::group_process could not write to file $doc_file\n";
382 return;
383 }
384 $self->{'gs_filename'} = $doc_file;
385 $self->{'gs_short_filename'} = $short_doc_file;
386 $self->{'gs_OID'} = $OID;
387
388 $self->output_xml_header('docsave::OUTDOC');
389 }
390 }
391
392 # save this document
393 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
394
395 $self->{'gs_count'}++;
396}
397
398sub get_doc_dir {
399 my $self = shift (@_);
400 my ($OID, $source_filename) = @_;
401 my $doc_info;
402 my $doc_dir = '';
403 my $service = $self-> {'service'};
404 my $working_dir;
405 my $working_info;
406
407 if ($service eq "import") {
408 $doc_info = $self->{'archive_info'}->get_info($OID);
409 $working_dir = $self->{'archive_dir'};
410 $working_info = $self->{'archive_info'};
411 } elsif ($service eq "export") {
412 $doc_info =$self->{'export_info'}->get_info($OID);
413 $working_dir = $self->{'export_dir'};
414 $working_info = $self->{'export_info'};
415 } else {
416 return;
417 }
418 if (defined $doc_info && scalar(@$doc_info) >= 1) {
419 # this OID already has an assigned directory, use the
420 # same one.
421 $doc_dir = $doc_info->[0];
422 $doc_dir =~ s/\/?doc(mets)?\.xml(\.gz)?$//;
423 } elsif ($self->{'keepimportstructure'}) {
424 $source_filename = &File::Basename::dirname($source_filename);
425 $source_filename =~ s/[\\\/]+/\//g;
426 $source_filename =~ s/\/$//;
427
428
429 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
430 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
431
432 }
433 if ($doc_dir eq "") {
434 # have to get a new document directory
435 my $doc_dir_rest = $OID;
436 my $doc_dir_num = 0;
437 do {
438 $doc_dir .= "/" if $doc_dir_num > 0;
439 if ($doc_dir_rest =~ s/^(.{1,8})//) {
440 $doc_dir .= $1;
441 $doc_dir_num++;
442 }
443 } while ($doc_dir_rest ne "" &&
444 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
445 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
446
447 $doc_dir .= ".dir";
448 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
449 }
450 return $doc_dir;
451}
452
453sub process_assoc_files {
454 my $self = shift (@_);
455 my ($doc_obj, $doc_dir, $handle) = @_;
456
457 my $outhandle = $self->{'outhandle'};
458
459 my @assoc_files = ();
460 my $filename;;
461 my $working_dir;
462 my $service = $self->{'service'};
463 my $save_as = $self->{'saveas'};
464
465 if ($service eq "import") {
466 $working_dir = $self->{'archive_dir'};
467 } elsif ($service eq "export"){
468 $working_dir = $self->{'export_dir'};
469 } else {
470 return;
471 }
472
473 if ($save_as eq "DSpace") {
474
475 my ($tail_filename) = ($doc_obj->get_source_filename() =~ m/\/([^\/\\]*)$/);
476
477 print $handle "$tail_filename\n";
478
479 $filename = &util::filename_cat($working_dir, $doc_dir, $tail_filename);
480 &util::hard_link ($doc_obj->get_source_filename(), $filename);
481 }
482
483 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
484 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
485 $dir = "" unless defined $dir;
486
487 # Store the associated file to the "contents" file
488 if ($save_as eq "DSpace") {
489 print $handle "$assoc_file->[1]\n";
490 }
491
492 if (-e $assoc_file->[0]) {
493 $filename = &util::filename_cat($working_dir, $doc_dir, $afile);
494
495 &util::hard_link ($assoc_file->[0], $filename);
496
497 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
498 "gsdlassocfile",
499 "$afile:$assoc_file->[2]:$dir");
500 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
501 "assocfilepath",
502 "$doc_dir");
503 } elsif ($self->{'verbosity'} > 2) {
504 print $outhandle "docsave::process couldn't copy the associated file " .
505 "$assoc_file->[0] to $afile\n";
506 }
507 }
508}
509
510
511sub close_file_output
512{
513 my ($self) = @_;
514 my $service =$self->{'service'};
515
516 # make sure that the handle has been opened - it won't be if we failed
517 # to import any documents...
518 if (defined(fileno(docsave::OUTDOC))) {
519 $self->output_xml_footer('docsave::OUTDOC');
520 close OUTDOC;
521 }
522
523 my $OID = $self->{'gs_OID'};
524 my $short_doc_file;
525 # can we use 'short_doc_file' for GA too?
526 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
527 $short_doc_file=$self->{'short_doc_file'};
528 } elsif ($self->{'saveas'} eq "GA") { # "GA"
529 $short_doc_file=$self->{'gs_short_filename'};
530 } else { # "DSpace"
531 }
532
533 if ($self->{'gzip'}) {
534 my $doc_file = $self->{'gs_filename'};
535 `gzip $doc_file`;
536 $doc_file .= ".gz";
537 $short_doc_file .= ".gz";
538 if (!-e $doc_file) {
539 my $outhandle = $self->{'outhandle'};
540 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
541 return 0;
542 }
543 }
544
545 # store reference in the archive_info and export_infor
546 if ($service eq "import") {
547 $self->{'archive_info'}->add_info($OID, $short_doc_file);
548 } elsif ($service eq "export") {
549 $self->{'export_info'}->add_info($OID, $short_doc_file);
550 } else {
551 return;
552 }
553 return 1;
554}
555
556sub output_xml_header {
557 my $self = shift (@_);
558 my ($handle) = @_;
559
560 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
561
562 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
563 print $handle "<Archive>\n";
564}
565
566sub output_xml_footer {
567 my $self = shift (@_);
568 my ($handle) = @_;
569
570 print $handle "</Archive>\n";
571}
572
573sub output_txt_xml_header{
574 my $self = shift (@_);
575 my ($handle) = @_;
576 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
577 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
578}
579
580sub output_txt_xml_footer{
581 my $self = shift(@_);
582 my ($handle) = @_;
583 print $handle "<the end of the file>\n";
584}
585
586sub output_mets_xml_header(){
587 my $self = shift(@_);
588 my ($handle, $OID) = @_;
589
590 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
591 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
592 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
593 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
594 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
595 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
596 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
597 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
598 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
599 print $handle ' OBJID="'. $OID. ':2">' . "\n";
600}
601
602sub output_mets_xml_footer() {
603 my $self = shift(@_);
604 my ($handle) = @_;
605 print $handle '</mets:mets>' . "\n";
606}
607
608sub output_dc_xml_header(){
609 my $self = shift(@_);
610 my ($handle, $OID) = @_;
611
612 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
613# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
614 print $handle '<dublin_core>' . "\n";
615}
616
617sub output_dc_xml_footer() {
618 my $self = shift(@_);
619 my ($handle) = @_;
620 print $handle '</dublin_core>' . "\n";
621}
6221;
Note: See TracBrowser for help on using the repository browser.