source: trunk/gsdl/perllib/docsave.pm@ 9042

Last change on this file since 9042 was 8895, checked in by chi, 19 years ago

Modification of the validated METS format in the docmets.xml.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use expinfo;
38use docproc;
39use util;
40
41
42sub BEGIN {
43 @docsave::ISA = ('docproc');
44}
45
46sub new {
47 my ($class, $collection, $info, $verbosity,
48 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
49 my $self = new docproc ();
50
51 $groupsize=1 unless defined $groupsize;
52 $self->{'collection'} = $collection;
53 if ($service eq "import"){
54 $self->{'archive_info'} = $info;
55 } elsif ($service eq "export"){
56 $self->{'export_info'} = $info;
57 } else {
58 return;
59 }
60
61 $self->{'verbosity'} = $verbosity;
62 $self->{'gzip'} = $gzip;
63 $self->{'keepimportstructure'} = 0;
64 $self->{'groupsize'} = $groupsize;
65 $self->{'gs_count'} = 0;
66
67 $self->{'outhandle'} = 'STDERR';
68 $self->{'outhandle'} = $outhandle if defined $outhandle;
69 $self->{'service'} = $service;
70 $self->{'saveas'} = $saveas;
71
72 # set a default for the archive directory
73 if ($service eq "import"){
74 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
75 } elsif ($service eq "export") {
76 # set a default for the export directory
77 $self->{'export_dir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");
78 } else {
79 return;
80 }
81 $self->{'sortmeta'} = undef;
82
83 return bless $self, $class;
84}
85
86sub setarchivedir {
87 my $self = shift (@_);
88 my ($archive_dir) = @_;
89
90 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
91 $self->{'archive_dir'} = $archive_dir;
92}
93
94sub setexportdir {
95 my $self = shift (@_);
96 my ($export_dir) = @_;
97
98 &util::mk_all_dir ($export_dir) unless -e $export_dir;
99 $self->{'export_dir'} = $export_dir;
100}
101
102sub set_sortmeta {
103 my $self = shift (@_);
104 my ($sortmeta, $removeprefix, $removesuffix) = @_;
105
106 $self->{'sortmeta'} = $sortmeta;
107 if (defined ($removeprefix) && $removeprefix ) {
108 $removeprefix =~ s/^\^//; # don't need a leading ^
109 $self->{'removeprefix'} = $removeprefix;
110 }
111 if (defined ($removesuffix) && $removesuffix) {
112 $removesuffix =~ s/\$$//; # don't need a trailing $
113 $self->{'removesuffix'} = $removesuffix;
114 }
115}
116
117sub process {
118 my $self = shift (@_);
119 my ($doc_obj) = @_;
120
121 my $outhandle = $self->{'outhandle'};
122 my $service = $self->{'service'} || "import";
123
124 # Define the SaveAs Type
125 my $save_as = $self->{'saveas'} || "GA";
126 my $collection = $self->{'collection'};
127
128 if ($self->{'groupsize'} > 1) {
129 $self->group_process ($doc_obj);
130 return;
131 }
132
133 my $OID = $doc_obj->get_OID();
134 $OID = "NULL" unless defined $OID;
135
136 # get document's directory
137 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
138
139 # groupsize is 1 (i.e. one document per XML file) so sortmeta
140 # may be used
141
142 if ($service eq "import") {
143 my $archive_info = $self->{'archive_info'};
144 } elsif ($service eq "export") {
145 my $export_info = $self->{'export_info'};
146 } else {
147 return;
148 }
149
150 # copy all the associated files, add this information as metadata
151 # to the document
152 if ($service eq "export" && $save_as eq "DSpace") {
153 # open contents file
154 my $doc_contents_file
155 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "contents");
156
157 if (!open(OUTDOC_EXPORT_CONTENTS,">$doc_contents_file")){
158 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
159 return;
160 }
161 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_EXPORT_CONTENTS');
162 } else {
163 $self->process_assoc_files ($doc_obj, $doc_dir, '');
164 }
165
166 my $doc_file;
167 my $doc_mets_file;
168 my $doc_txt_file;
169 my $short_doc_file;
170
171 #Import collection to GS2 in GS Archive format and METs format
172 if ($service eq "import") {
173 my $doc_file
174 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
175
176 #***define doctxt.xml file
177 my $doc_txt_file
178 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
179
180 my $import_working_dir
181 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
182
183 #***define docmets.xml file
184 my $doc_mets_file
185 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
186
187 if ($save_as eq "GA") {
188 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
189 } elsif ($save_as eq "METS") {
190 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
191 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
192 } else {
193 return;
194 }
195
196 if ($save_as eq "GA") {
197 if (!open (OUTDOC, ">$doc_file")) {
198 print $outhandle "docsave::process could not write to file $doc_file\n";
199 return;
200 }
201 # save this document
202 $self->output_xml_header('docsave::OUTDOC');
203 $doc_obj->output_section('docsave::OUTDOC',
204 $doc_obj->get_top_section());
205 $self->output_xml_footer('docsave::OUTDOC');
206
207 close OUTDOC;
208 } elsif ($save_as eq "METS") {
209 # save the document without metadata:doctxt.xml
210
211 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
212 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
213 return;
214 }
215
216 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
217 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
218 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
219
220 # Convert doctxt.xml file to docmets.xml
221 if (!open(OUTDOC_METS,">$doc_mets_file")){
222 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
223 return;
224 }
225
226 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
227 $doc_obj->output_mets_section('docsave::OUTDOC_METS',
228 $doc_obj->get_top_section());
229 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
230
231 close OUTDOC_TXT;
232 close OUTDOC_METS;
233 } else { # save_as isn't GA or METS
234 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
235 return;
236 }
237 }
238
239 ## Export the collection to METs format or DSpace Archive Format into the export directory
240 if ($service eq "export") {
241 my $doc_dc_file;
242 my $doc_contents_file;
243
244 my $export_working_dir
245 =&util::filename_cat ($self->{'export_dir'}, $doc_dir);
246
247 if ($save_as eq "METS") {
248 $doc_mets_file
249 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "docmets.xml");
250
251 $doc_txt_file
252 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "doctxt.xml");
253
254 if (!open(OUTDOC_EXPORT_TXT, ">$doc_txt_file")){
255 print $outhandle "docsave::process could not write TXT to file $doc_txt_file\n";
256 return;
257 }
258
259 $self->output_txt_xml_header('docsave::OUTDOC_EXPORT_TXT');
260 $doc_obj->output_txt_section('docsave::OUTDOC_EXPORT_TXT', $doc_obj->get_top_section());
261
262 if (!open(OUTDOC_EXPORT_METS,">$doc_mets_file")){
263 print $outhandle "docsave::process could not write METS format to file $doc_mets_file\n";
264 return;
265 }
266 $self->output_mets_xml_header('docsave::OUTDOC_EXPORT_METS', $OID);
267 $doc_obj->output_mets_section('docsave::OUTDOC_EXPORT_METS',$doc_obj->get_top_section(), $export_working_dir);
268 $self->output_mets_xml_footer('docsave::OUTDOC_EXPORT_METS');
269
270 close OUTDOC_EXPORT_TXT;
271 close OUTDOC_EXPORT_METS;
272 } elsif ($save_as eq "DSpace") {
273
274 # Generate dublin_core.xml file
275 $doc_dc_file
276 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "dublin_core.xml");
277
278 if (!open(OUTDOC_EXPORT_DC,">$doc_dc_file")){
279 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
280 return;
281 }
282
283 $self->output_dc_xml_header('docsave::OUTDOC_EXPORT_DC', $OID);
284 $doc_obj->output_dc_section('docsave::OUTDOC_EXPORT_DC',$doc_obj->get_top_section(), $export_working_dir);
285 $self->output_dc_xml_footer('docsave::OUTDOC_EXPORT_DC');
286
287 close OUTDOC_EXPORT_DC;
288 close OUTDOC_EXPORT_CONTENTS;
289 } else { # save_as isn't METS or DSpace
290 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
291 return;
292 }
293
294 if ($save_as eq "METS") {
295 $short_doc_file = util::filename_cat ($doc_dir, "docmets.xml");
296 } elsif ($save_as eq "DSpace") {
297 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
298 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
299 } else {
300 return;
301 }
302
303 }
304 #save for later (for close_file_output())
305 $self->{'short_doc_file'} = $short_doc_file;
306
307 if ($self->{'gzip'}) {
308 my $doc_file = $self->{'gs_filename'};
309 `gzip $doc_file`;
310 $doc_file .= ".gz";
311 $short_doc_file .= ".gz";
312 if (!-e $doc_file) {
313 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
314 return 0;
315 }
316 }
317
318 # do the sortmeta thing
319 my ($metadata);
320 if (defined ($self->{'sortmeta'})) {
321 $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
322 $self->{'sortmeta'});
323 }
324 if (defined ($metadata) && $metadata) {
325 # do remove prefix/suffix
326 if (defined($self->{'removeprefix'})) {
327 $metadata =~ s/^$self->{'removeprefix'}//;
328 }
329 if (defined($self->{'removesuffix'})) {
330 $metadata =~ s/$self->{'removesuffix'}$//;
331 }
332 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
333 }
334 # store reference in the archive_info and export_info
335 if ($service eq "export") {
336 $self->{'export_info'}->add_info($OID, $short_doc_file, $metadata);
337 } elsif ($service eq "import") {
338 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
339 }
340}
341
342
343sub group_process {
344 my $self = shift (@_);
345 my ($doc_obj) = @_;
346
347 my $outhandle = $self->{'outhandle'};
348
349 my $OID = $doc_obj->get_OID();
350 $OID = "NULL" unless defined $OID;
351
352 my $groupsize = $self->{'groupsize'};
353 my $gs_count = $self->{'gs_count'};
354 my $open_new_file = (($gs_count % $groupsize)==0);
355
356 # opening a new file, or document has assoicated files => directory needed
357 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
358
359 # get document's directory
360 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
361
362 # copy all the associated files, add this information as metadata
363 # to the document
364 $self->process_assoc_files ($doc_obj, $doc_dir);
365
366
367 if ($open_new_file) {
368 # only if opening new file
369 my $doc_file
370 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
371 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
372
373 if ($gs_count>0)
374 {
375 return if (!$self->close_file_output());
376 }
377
378 if (!open (OUTDOC, ">$doc_file")) {
379 print $outhandle "docsave::group_process could not write to file $doc_file\n";
380 return;
381 }
382 $self->{'gs_filename'} = $doc_file;
383 $self->{'gs_short_filename'} = $short_doc_file;
384 $self->{'gs_OID'} = $OID;
385
386 $self->output_xml_header('docsave::OUTDOC');
387 }
388 }
389
390 # save this document
391 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
392
393 $self->{'gs_count'}++;
394}
395
396sub get_doc_dir {
397 my $self = shift (@_);
398 my ($OID, $source_filename) = @_;
399 my $doc_info;
400 my $doc_dir = '';
401 my $service = $self-> {'service'};
402 my $working_dir;
403 my $working_info;
404
405 if ($service eq "import") {
406 $doc_info = $self->{'archive_info'}->get_info($OID);
407 $working_dir = $self->{'archive_dir'};
408 $working_info = $self->{'archive_info'};
409 } elsif ($service eq "export") {
410 $doc_info =$self->{'export_info'}->get_info($OID);
411 $working_dir = $self->{'export_dir'};
412 $working_info = $self->{'export_info'};
413 } else {
414 return;
415 }
416 if (defined $doc_info && scalar(@$doc_info) >= 1) {
417 # this OID already has an assigned directory, use the
418 # same one.
419 $doc_dir = $doc_info->[0];
420 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
421 } elsif ($self->{'keepimportstructure'}) {
422 $source_filename = &File::Basename::dirname($source_filename);
423 $source_filename =~ s/[\\\/]+/\//g;
424 $source_filename =~ s/\/$//;
425
426
427 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
428 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
429
430 }
431 if ($doc_dir eq "") {
432 # have to get a new document directory
433 my $doc_dir_rest = $OID;
434 my $doc_dir_num = 0;
435 do {
436 $doc_dir .= "/" if $doc_dir_num > 0;
437 if ($doc_dir_rest =~ s/^(.{1,8})//) {
438 $doc_dir .= $1;
439 $doc_dir_num++;
440 }
441 } while ($doc_dir_rest ne "" &&
442 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
443 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
444
445 $doc_dir .= ".dir";
446 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
447 }
448 return $doc_dir;
449}
450
451sub process_assoc_files {
452 my $self = shift (@_);
453 my ($doc_obj, $doc_dir, $handle) = @_;
454
455 my $outhandle = $self->{'outhandle'};
456
457 my @assoc_files = ();
458 my $filename;;
459 my $working_dir;
460 my $service = $self->{'service'};
461 my $save_as = $self->{'saveas'};
462
463 if ($service eq "import") {
464 $working_dir = $self->{'archive_dir'};
465 } elsif ($service eq "export"){
466 $working_dir = $self->{'export_dir'};
467 } else {
468 return;
469 }
470 $doc_obj->get_source_filename()=~ /\/[^\/\\]$/;
471
472 if ($save_as eq "DSpace") {
473 print $handle "$1\n";
474 $filename = &util::filename_cat($working_dir, $doc_dir, $1);
475 &util::hard_link ($doc_obj->get_source_filename(), $filename);
476 }
477
478 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
479 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
480 $dir = "" unless defined $dir;
481
482 # Store the associated file to the "contents" file
483 if ($save_as eq "DSpace") {
484 print $handle "$assoc_file->[1]\n";
485 }
486
487 if (-e $assoc_file->[0]) {
488 $filename = &util::filename_cat($working_dir, $doc_dir, $afile);
489
490 &util::hard_link ($assoc_file->[0], $filename);
491
492 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
493 "gsdlassocfile",
494 "$afile:$assoc_file->[2]:$dir");
495 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
496 "assocfilepath",
497 "$doc_dir");
498 } elsif ($self->{'verbosity'} > 2) {
499 print $outhandle "docsave::process couldn't copy the associated file " .
500 "$assoc_file->[0] to $afile\n";
501 }
502 }
503}
504
505
506sub close_file_output
507{
508 my ($self) = @_;
509 my $service =$self->{'service'};
510
511 # make sure that the handle has been opened - it won't be if we failed
512 # to import any documents...
513 if (defined(fileno(docsave::OUTDOC))) {
514 $self->output_xml_footer('docsave::OUTDOC');
515 close OUTDOC;
516 }
517
518 my $OID = $self->{'gs_OID'};
519 my $short_doc_file;
520 # can we use 'short_doc_file' for GA too?
521 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
522 $short_doc_file=$self->{'short_doc_file'};
523 } elsif ($self->{'saveas'} eq "GA") { # "GA"
524 $short_doc_file=$self->{'gs_short_filename'};
525 } else { # "DSpace"
526 }
527
528 if ($self->{'gzip'}) {
529 my $doc_file = $self->{'gs_filename'};
530 `gzip $doc_file`;
531 $doc_file .= ".gz";
532 $short_doc_file .= ".gz";
533 if (!-e $doc_file) {
534 my $outhandle = $self->{'outhandle'};
535 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
536 return 0;
537 }
538 }
539
540 # store reference in the archive_info and export_infor
541 if ($service eq "import") {
542 $self->{'archive_info'}->add_info($OID, $short_doc_file);
543 } elsif ($service eq "export") {
544 $self->{'export_info'}->add_info($OID, $short_doc_file);
545 } else {
546 return;
547 }
548 return 1;
549}
550
551sub output_xml_header {
552 my $self = shift (@_);
553 my ($handle) = @_;
554
555 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
556
557 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
558 print $handle "<Archive>\n";
559}
560
561sub output_xml_footer {
562 my $self = shift (@_);
563 my ($handle) = @_;
564
565 print $handle "</Archive>\n";
566}
567
568sub output_txt_xml_header{
569 my $self = shift (@_);
570 my ($handle) = @_;
571 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
572 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
573}
574
575sub output_txt_xml_footer{
576 my $self = shift(@_);
577 my ($handle) = @_;
578 print $handle "<the end of the file>\n";
579}
580
581sub output_mets_xml_header(){
582 my $self = shift(@_);
583 my ($handle, $OID) = @_;
584
585 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
586 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
587 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
588 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
589 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
590 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
591 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
592 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
593 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
594 print $handle ' OBJID="'. $OID. ':2">' . "\n";
595}
596
597sub output_mets_xml_footer() {
598 my $self = shift(@_);
599 my ($handle) = @_;
600 print $handle '</mets:mets>' . "\n";
601}
602
603sub output_dc_xml_header(){
604 my $self = shift(@_);
605 my ($handle, $OID) = @_;
606
607 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
608# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
609 print $handle '<dublin_core>' . "\n";
610}
611
612sub output_dc_xml_footer() {
613 my $self = shift(@_);
614 my ($handle) = @_;
615 print $handle '</dublin_core>' . "\n";
616}
6171;
Note: See TracBrowser for help on using the repository browser.