source: trunk/gsdl/perllib/docsave.pm@ 8517

Last change on this file since 8517 was 8517, checked in by chi, 19 years ago

Add and modify methods to deal with exporting GS collections to "METS" and "DSpace" format.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use expinfo;
38use docproc;
39use util;
40
41
42sub BEGIN {
43 @docsave::ISA = ('docproc');
44}
45
46sub new {
47 my ($class, $collection, $info, $verbosity,
48 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
49 my $self = new docproc ();
50
51 $groupsize=1 unless defined $groupsize;
52 $self->{'collection'} = $collection;
53 if ($service eq "import"){
54 $self->{'archive_info'} = $info;
55 } elsif ($service eq "export"){
56 $self->{'export_info'} = $info;
57 } else {
58 return;
59 }
60
61 $self->{'verbosity'} = $verbosity;
62 $self->{'gzip'} = $gzip;
63
64 $self->{'groupsize'} = $groupsize;
65 $self->{'gs_count'} = 0;
66
67 $self->{'outhandle'} = 'STDERR';
68 $self->{'outhandle'} = $outhandle if defined $outhandle;
69 $self->{'service'} = $service;
70 $self->{'saveas'} = $saveas;
71
72 # set a default for the archive directory
73 if ($service eq "import"){
74 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
75 } elsif ($service eq "export") {
76 # set a default for the export directory
77 $self->{'export_dir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");
78 } else {
79 return;
80 }
81 $self->{'sortmeta'} = undef;
82
83 return bless $self, $class;
84}
85
86sub setarchivedir {
87 my $self = shift (@_);
88 my ($archive_dir) = @_;
89
90 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
91 $self->{'archive_dir'} = $archive_dir;
92}
93
94sub setexportdir {
95 my $self = shift (@_);
96 my ($export_dir) = @_;
97
98 &util::mk_all_dir ($export_dir) unless -e $export_dir;
99 $self->{'export_dir'} = $export_dir;
100}
101
102sub set_sortmeta {
103 my $self = shift (@_);
104 my ($sortmeta) = @_;
105
106 $self->{'sortmeta'} = $sortmeta;
107 }
108
109sub process {
110 my $self = shift (@_);
111 my ($doc_obj) = @_;
112
113 my $outhandle = $self->{'outhandle'};
114 my $service = $self->{'service'} || "import";
115
116 # Define the SaveAs Type
117 my $save_as = $self->{'saveas'} || "GA";
118 my $collection = $self->{'collection'};
119
120 if ($self->{'groupsize'} > 1) {
121 $self->group_process ($doc_obj);
122 return;
123 }
124
125 my $OID = $doc_obj->get_OID();
126 $OID = "NULL" unless defined $OID;
127
128 # get document's directory
129 my $doc_dir = $self->get_doc_dir ($OID);
130
131 # groupsize is 1 (i.e. one document per XML file) so sortmeta
132 # may be used
133
134 if ($service eq "import") {
135 my $archive_info = $self->{'archive_info'};
136 } elsif ($service eq "export") {
137 my $export_info = $self->{'export_info'};
138 } else {
139 return;
140 }
141
142 # copy all the associated files, add this information as metadata
143 # to the document
144 if ($service eq "export" && $save_as eq "DSpace") {
145 # open contents file
146 my $doc_contents_file
147 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "contents");
148
149 if (!open(OUTDOC_EXPORT_CONTENTS,">$doc_contents_file")){
150 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
151 return;
152 }
153 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_EXPORT_CONTENTS');
154 } else {
155 $self->process_assoc_files ($doc_obj, $doc_dir, '');
156 }
157
158 my $doc_file;
159 my $doc_mets_file;
160 my $doc_txt_file;
161 my $short_doc_file;
162
163 #Import collection to GS2 in GS Archive format and METs format
164 if ($service eq "import") {
165 my $doc_file
166 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
167
168 #***define doctxt.xml file
169 my $doc_txt_file
170 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
171
172 my $import_working_dir
173 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
174
175 #***define docmets.xml file
176 my $doc_mets_file
177 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
178
179 if ($save_as eq "GA") {
180 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
181 } elsif ($save_as eq "METS") {
182 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
183 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
184 } else {
185 return;
186 }
187
188 if ($save_as eq "GA") {
189 if (!open (OUTDOC, ">$doc_file")) {
190 print $outhandle "docsave::process could not write to file $doc_file\n";
191 return;
192 }
193 # save this document
194 $self->output_xml_header('docsave::OUTDOC');
195 $doc_obj->output_section('docsave::OUTDOC',
196 $doc_obj->get_top_section());
197 $self->output_xml_footer('docsave::OUTDOC');
198
199 close OUTDOC;
200 } elsif ($save_as eq "METS") {
201 # save the document without metadata:doctxt.xml
202
203 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
204 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
205 return;
206 }
207
208 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
209 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
210 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
211
212 # Convert doctxt.xml file to docmets.xml
213 if (!open(OUTDOC_METS,">$doc_mets_file")){
214 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
215 return;
216 }
217
218 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
219 $doc_obj->output_mets_section('docsave::OUTDOC_METS',
220 $doc_obj->get_top_section(),
221 $import_working_dir);
222 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
223
224 close OUTDOC_TXT;
225 close OUTDOC_METS;
226 } else { # save_as isn't GA or METS
227 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
228 return;
229 }
230 }
231
232 ## Export the collection to METs format or DSpace Archive Format into the export directory
233 if ($service eq "export") {
234 my $doc_dc_file;
235 my $doc_contents_file;
236
237 my $export_working_dir
238 =&util::filename_cat ($self->{'export_dir'}, $doc_dir);
239
240 if ($save_as eq "METS") {
241 $doc_mets_file
242 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "docmets.xml");
243
244 $doc_txt_file
245 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "doctxt.xml");
246
247 if (!open(OUTDOC_EXPORT_TXT, ">$doc_txt_file")){
248 print $outhandle "docsave::process could not write TXT to file $doc_txt_file\n";
249 return;
250 }
251
252 $self->output_txt_xml_header('docsave::OUTDOC_EXPORT_TXT');
253 $doc_obj->output_txt_section('docsave::OUTDOC_EXPORT_TXT', $doc_obj->get_top_section());
254
255 if (!open(OUTDOC_EXPORT_METS,">$doc_mets_file")){
256 print $outhandle "docsave::process could not write METS format to file $doc_mets_file\n";
257 return;
258 }
259 $self->output_mets_xml_header('docsave::OUTDOC_EXPORT_METS', $OID);
260 $doc_obj->output_mets_section('docsave::OUTDOC_EXPORT_METS',$doc_obj->get_top_section(), $export_working_dir);
261 $self->output_mets_xml_footer('docsave::OUTDOC_EXPORT_METS');
262
263 close OUTDOC_EXPORT_TXT;
264 close OUTDOC_EXPORT_METS;
265 } elsif ($save_as eq "DSpace") {
266
267 # Generate dublin_core.xml file
268 $doc_dc_file
269 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "dublin_core.xml");
270
271 if (!open(OUTDOC_EXPORT_DC,">$doc_dc_file")){
272 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
273 return;
274 }
275
276 $self->output_dc_xml_header('docsave::OUTDOC_EXPORT_DC', $OID);
277 $doc_obj->output_dc_section('docsave::OUTDOC_EXPORT_DC',$doc_obj->get_top_section(), $export_working_dir);
278 $self->output_dc_xml_footer('docsave::OUTDOC_EXPORT_DC');
279
280 close OUTDOC_EXPORT_DC;
281 close OUTDOC_EXPORT_CONTENTS;
282 } else { # save_as isn't METS or DSpace
283 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
284 return;
285 }
286
287 if ($save_as eq "METS") {
288 $short_doc_file = util::filename_cat ($doc_dir, "docmets.xml");
289 } elsif ($save_as eq "DSpace") {
290 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
291 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
292 } else {
293 return;
294 }
295
296 }
297 #save for later (for close_file_output())
298 $self->{'short_doc_file'} = $short_doc_file;
299
300 if ($self->{'gzip'}) {
301 my $doc_file = $self->{'gs_filename'};
302 `gzip $doc_file`;
303 $doc_file .= ".gz";
304 $short_doc_file .= ".gz";
305 if (!-e $doc_file) {
306 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
307 return 0;
308 }
309 }
310
311 # do the sortmeta thing
312 my ($metadata);
313 if (defined ($self->{'sortmeta'})) {
314 $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
315 $self->{'sortmeta'});
316 }
317
318 # store reference in the archive_info and export_info
319 if ($service eq "export") {
320 $self->{'export_info'}->add_info($OID, $short_doc_file, $metadata);
321 } elsif ($service eq "import") {
322 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
323 }
324}
325
326
327sub group_process {
328 my $self = shift (@_);
329 my ($doc_obj) = @_;
330
331 my $outhandle = $self->{'outhandle'};
332
333 my $OID = $doc_obj->get_OID();
334 $OID = "NULL" unless defined $OID;
335
336 my $groupsize = $self->{'groupsize'};
337 my $gs_count = $self->{'gs_count'};
338 my $open_new_file = (($gs_count % $groupsize)==0);
339
340 # opening a new file, or document has assoicated files => directory needed
341 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
342
343 # get document's directory
344 my $doc_dir = $self->get_doc_dir ($OID);
345
346 # copy all the associated files, add this information as metadata
347 # to the document
348 $self->process_assoc_files ($doc_obj, $doc_dir);
349
350
351 if ($open_new_file) {
352 # only if opening new file
353 my $doc_file
354 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
355 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
356
357 if ($gs_count>0)
358 {
359 return if (!$self->close_file_output());
360 }
361
362 if (!open (OUTDOC, ">$doc_file")) {
363 print $outhandle "docsave::group_process could not write to file $doc_file\n";
364 return;
365 }
366 $self->{'gs_filename'} = $doc_file;
367 $self->{'gs_short_filename'} = $short_doc_file;
368 $self->{'gs_OID'} = $OID;
369
370 $self->output_xml_header('docsave::OUTDOC');
371 }
372 }
373
374 # save this document
375 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
376
377 $self->{'gs_count'}++;
378}
379
380sub get_doc_dir {
381 my $self = shift (@_);
382 my ($OID) = @_;
383 my $doc_info;
384 my $doc_dir;
385 my $service = $self-> {'service'};
386 my $working_dir;
387 my $working_info;
388
389 if ($service eq "import") {
390 $doc_info = $self->{'archive_info'}->get_info($OID);
391 $working_dir = $self->{'archive_dir'};
392 $working_info = $self->{'archive_info'};
393 } elsif ($service eq "export") {
394 $doc_info =$self->{'export_info'}->get_info($OID);
395 $working_dir = $self->{'export_dir'};
396 $working_info = $self->{'export_info'};
397 } else {
398 return;
399 }
400 if (defined $doc_info && scalar(@$doc_info) >= 1) {
401 # this OID already has an assigned directory, use the
402 # same one.
403 $doc_dir = $doc_info->[0];
404 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
405 } else {
406 # have to get a new document directory
407 my $doc_dir_rest = $OID;
408 my $doc_dir_num = 0;
409 do {
410 $doc_dir .= "/" if $doc_dir_num > 0;
411 if ($doc_dir_rest =~ s/^(.{1,8})//) {
412 $doc_dir .= $1;
413 $doc_dir_num++;
414 }
415 } while ($doc_dir_rest ne "" &&
416 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
417 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
418
419 $doc_dir .= ".dir";
420 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
421 }
422 return $doc_dir;
423}
424
425sub process_assoc_files {
426 my $self = shift (@_);
427 my ($doc_obj, $doc_dir, $handle) = @_;
428
429 my $outhandle = $self->{'outhandle'};
430
431 my @assoc_files = ();
432 my $filename;;
433 my $working_dir;
434 my $service = $self->{'service'};
435 my $save_as = $self->{'saveas'};
436
437 if ($service eq "import") {
438 $working_dir = $self->{'archive_dir'};
439 } elsif ($service eq "export"){
440 $working_dir = $self->{'export_dir'};
441 } else {
442 return;
443 }
444 $doc_obj->get_source_filename()=~ /\/[^\/\\]$/;
445
446 if ($save_as eq "DSpace") {
447 print $handle "$1\n";
448 $filename = &util::filename_cat($working_dir, $doc_dir, $1);
449 &util::hard_link ($doc_obj->get_source_filename(), $filename);
450 }
451
452 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
453 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
454 $dir = "" unless defined $dir;
455
456 # Store the associated file to the "contents" file
457 if ($save_as eq "DSpace") {
458 print $handle "$assoc_file->[1]\n";
459 }
460
461 if (-e $assoc_file->[0]) {
462 $filename = &util::filename_cat($working_dir, $doc_dir, $afile);
463
464 &util::hard_link ($assoc_file->[0], $filename);
465
466 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
467 "gsdlassocfile",
468 "$afile:$assoc_file->[2]:$dir");
469 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
470 "assocfilepath",
471 "$doc_dir");
472 } elsif ($self->{'verbosity'} > 2) {
473 print $outhandle "docsave::process couldn't copy the associated file " .
474 "$assoc_file->[0] to $afile\n";
475 }
476 }
477}
478
479
480sub close_file_output
481{
482 my ($self) = @_;
483 my $service =$self->{'service'};
484
485 # make sure that the handle has been opened - it won't be if we failed
486 # to import any documents...
487 if (defined(fileno(docsave::OUTDOC))) {
488 $self->output_xml_footer('docsave::OUTDOC');
489 close OUTDOC;
490 }
491
492 my $OID = $self->{'gs_OID'};
493 my $short_doc_file;
494 # can we use 'short_doc_file' for GA too?
495 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
496 $short_doc_file=$self->{'short_doc_file'};
497 } elsif ($self->{'saveas'} eq "GA") { # "GA"
498 $short_doc_file=$self->{'gs_short_filename'};
499 } else { # "DSpace"
500 }
501
502 if ($self->{'gzip'}) {
503 my $doc_file = $self->{'gs_filename'};
504 `gzip $doc_file`;
505 $doc_file .= ".gz";
506 $short_doc_file .= ".gz";
507 if (!-e $doc_file) {
508 my $outhandle = $self->{'outhandle'};
509 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
510 return 0;
511 }
512 }
513
514 # store reference in the archive_info and export_infor
515 if ($service eq "import") {
516 $self->{'archive_info'}->add_info($OID, $short_doc_file);
517 } elsif ($service eq "export") {
518 $self->{'export_info'}->add_info($OID, $short_doc_file);
519 } else {
520 return;
521 }
522 return 1;
523}
524
525sub output_xml_header {
526 my $self = shift (@_);
527 my ($handle) = @_;
528
529 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
530
531 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
532 print $handle "<Archive>\n";
533}
534
535sub output_xml_footer {
536 my $self = shift (@_);
537 my ($handle) = @_;
538
539 print $handle "</Archive>\n";
540}
541
542sub output_txt_xml_header{
543 my $self = shift (@_);
544 my ($handle) = @_;
545 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
546 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
547}
548
549sub output_txt_xml_footer{
550 my $self = shift(@_);
551 my ($handle) = @_;
552 print $handle "<the end of the file>\n";
553}
554
555sub output_mets_xml_header(){
556 my $self = shift(@_);
557 my ($handle, $OID) = @_;
558
559 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
560 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
561 print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
562}
563
564sub output_mets_xml_footer() {
565 my $self = shift(@_);
566 my ($handle) = @_;
567 print $handle '</mets:mets>' . "\n";
568}
569
570sub output_dc_xml_header(){
571 my $self = shift(@_);
572 my ($handle, $OID) = @_;
573
574 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
575# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
576 print $handle '<dublin_core>' . "\n";
577}
578
579sub output_dc_xml_footer() {
580 my $self = shift(@_);
581 my ($handle) = @_;
582 print $handle '</dublin_core>' . "\n";
583}
5841;
Note: See TracBrowser for help on using the repository browser.