source: trunk/gsdl/perllib/docsave.pm@ 8891

Last change on this file since 8891 was 8854, checked in by kjdon, 19 years ago

we now format the metadata used for sorting the import docs, can also use removeprefix and removesuffix

  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use expinfo;
38use docproc;
39use util;
40
41
42sub BEGIN {
43 @docsave::ISA = ('docproc');
44}
45
46sub new {
47 my ($class, $collection, $info, $verbosity,
48 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
49 my $self = new docproc ();
50
51 $groupsize=1 unless defined $groupsize;
52 $self->{'collection'} = $collection;
53 if ($service eq "import"){
54 $self->{'archive_info'} = $info;
55 } elsif ($service eq "export"){
56 $self->{'export_info'} = $info;
57 } else {
58 return;
59 }
60
61 $self->{'verbosity'} = $verbosity;
62 $self->{'gzip'} = $gzip;
63 $self->{'keepimportstructure'} = 0;
64 $self->{'groupsize'} = $groupsize;
65 $self->{'gs_count'} = 0;
66
67 $self->{'outhandle'} = 'STDERR';
68 $self->{'outhandle'} = $outhandle if defined $outhandle;
69 $self->{'service'} = $service;
70 $self->{'saveas'} = $saveas;
71
72 # set a default for the archive directory
73 if ($service eq "import"){
74 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
75 } elsif ($service eq "export") {
76 # set a default for the export directory
77 $self->{'export_dir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");
78 } else {
79 return;
80 }
81 $self->{'sortmeta'} = undef;
82
83 return bless $self, $class;
84}
85
86sub setarchivedir {
87 my $self = shift (@_);
88 my ($archive_dir) = @_;
89
90 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
91 $self->{'archive_dir'} = $archive_dir;
92}
93
94sub setexportdir {
95 my $self = shift (@_);
96 my ($export_dir) = @_;
97
98 &util::mk_all_dir ($export_dir) unless -e $export_dir;
99 $self->{'export_dir'} = $export_dir;
100}
101
102sub set_sortmeta {
103 my $self = shift (@_);
104 my ($sortmeta, $removeprefix, $removesuffix) = @_;
105
106 $self->{'sortmeta'} = $sortmeta;
107 if (defined ($removeprefix) && $removeprefix ) {
108 $removeprefix =~ s/^\^//; # don't need a leading ^
109 $self->{'removeprefix'} = $removeprefix;
110 }
111 if (defined ($removesuffix) && $removesuffix) {
112 $removesuffix =~ s/\$$//; # don't need a trailing $
113 $self->{'removesuffix'} = $removesuffix;
114 }
115}
116
117sub process {
118 my $self = shift (@_);
119 my ($doc_obj) = @_;
120
121 my $outhandle = $self->{'outhandle'};
122 my $service = $self->{'service'} || "import";
123
124 # Define the SaveAs Type
125 my $save_as = $self->{'saveas'} || "GA";
126 my $collection = $self->{'collection'};
127
128 if ($self->{'groupsize'} > 1) {
129 $self->group_process ($doc_obj);
130 return;
131 }
132
133 my $OID = $doc_obj->get_OID();
134 $OID = "NULL" unless defined $OID;
135
136 # get document's directory
137 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
138
139 # groupsize is 1 (i.e. one document per XML file) so sortmeta
140 # may be used
141
142 if ($service eq "import") {
143 my $archive_info = $self->{'archive_info'};
144 } elsif ($service eq "export") {
145 my $export_info = $self->{'export_info'};
146 } else {
147 return;
148 }
149
150 # copy all the associated files, add this information as metadata
151 # to the document
152 if ($service eq "export" && $save_as eq "DSpace") {
153 # open contents file
154 my $doc_contents_file
155 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "contents");
156
157 if (!open(OUTDOC_EXPORT_CONTENTS,">$doc_contents_file")){
158 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
159 return;
160 }
161 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_EXPORT_CONTENTS');
162 } else {
163 $self->process_assoc_files ($doc_obj, $doc_dir, '');
164 }
165
166 my $doc_file;
167 my $doc_mets_file;
168 my $doc_txt_file;
169 my $short_doc_file;
170
171 #Import collection to GS2 in GS Archive format and METs format
172 if ($service eq "import") {
173 my $doc_file
174 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
175
176 #***define doctxt.xml file
177 my $doc_txt_file
178 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
179
180 my $import_working_dir
181 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
182
183 #***define docmets.xml file
184 my $doc_mets_file
185 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
186
187 if ($save_as eq "GA") {
188 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
189 } elsif ($save_as eq "METS") {
190 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
191 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
192 } else {
193 return;
194 }
195
196 if ($save_as eq "GA") {
197 if (!open (OUTDOC, ">$doc_file")) {
198 print $outhandle "docsave::process could not write to file $doc_file\n";
199 return;
200 }
201 # save this document
202 $self->output_xml_header('docsave::OUTDOC');
203 $doc_obj->output_section('docsave::OUTDOC',
204 $doc_obj->get_top_section());
205 $self->output_xml_footer('docsave::OUTDOC');
206
207 close OUTDOC;
208 } elsif ($save_as eq "METS") {
209 # save the document without metadata:doctxt.xml
210
211 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
212 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
213 return;
214 }
215
216 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
217 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
218 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
219
220 # Convert doctxt.xml file to docmets.xml
221 if (!open(OUTDOC_METS,">$doc_mets_file")){
222 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
223 return;
224 }
225
226 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
227 $doc_obj->output_mets_section('docsave::OUTDOC_METS',
228 $doc_obj->get_top_section(),
229 $import_working_dir);
230 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
231
232 close OUTDOC_TXT;
233 close OUTDOC_METS;
234 } else { # save_as isn't GA or METS
235 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
236 return;
237 }
238 }
239
240 ## Export the collection to METs format or DSpace Archive Format into the export directory
241 if ($service eq "export") {
242 my $doc_dc_file;
243 my $doc_contents_file;
244
245 my $export_working_dir
246 =&util::filename_cat ($self->{'export_dir'}, $doc_dir);
247
248 if ($save_as eq "METS") {
249 $doc_mets_file
250 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "docmets.xml");
251
252 $doc_txt_file
253 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "doctxt.xml");
254
255 if (!open(OUTDOC_EXPORT_TXT, ">$doc_txt_file")){
256 print $outhandle "docsave::process could not write TXT to file $doc_txt_file\n";
257 return;
258 }
259
260 $self->output_txt_xml_header('docsave::OUTDOC_EXPORT_TXT');
261 $doc_obj->output_txt_section('docsave::OUTDOC_EXPORT_TXT', $doc_obj->get_top_section());
262
263 if (!open(OUTDOC_EXPORT_METS,">$doc_mets_file")){
264 print $outhandle "docsave::process could not write METS format to file $doc_mets_file\n";
265 return;
266 }
267 $self->output_mets_xml_header('docsave::OUTDOC_EXPORT_METS', $OID);
268 $doc_obj->output_mets_section('docsave::OUTDOC_EXPORT_METS',$doc_obj->get_top_section(), $export_working_dir);
269 $self->output_mets_xml_footer('docsave::OUTDOC_EXPORT_METS');
270
271 close OUTDOC_EXPORT_TXT;
272 close OUTDOC_EXPORT_METS;
273 } elsif ($save_as eq "DSpace") {
274
275 # Generate dublin_core.xml file
276 $doc_dc_file
277 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "dublin_core.xml");
278
279 if (!open(OUTDOC_EXPORT_DC,">$doc_dc_file")){
280 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
281 return;
282 }
283
284 $self->output_dc_xml_header('docsave::OUTDOC_EXPORT_DC', $OID);
285 $doc_obj->output_dc_section('docsave::OUTDOC_EXPORT_DC',$doc_obj->get_top_section(), $export_working_dir);
286 $self->output_dc_xml_footer('docsave::OUTDOC_EXPORT_DC');
287
288 close OUTDOC_EXPORT_DC;
289 close OUTDOC_EXPORT_CONTENTS;
290 } else { # save_as isn't METS or DSpace
291 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
292 return;
293 }
294
295 if ($save_as eq "METS") {
296 $short_doc_file = util::filename_cat ($doc_dir, "docmets.xml");
297 } elsif ($save_as eq "DSpace") {
298 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
299 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
300 } else {
301 return;
302 }
303
304 }
305 #save for later (for close_file_output())
306 $self->{'short_doc_file'} = $short_doc_file;
307
308 if ($self->{'gzip'}) {
309 my $doc_file = $self->{'gs_filename'};
310 `gzip $doc_file`;
311 $doc_file .= ".gz";
312 $short_doc_file .= ".gz";
313 if (!-e $doc_file) {
314 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
315 return 0;
316 }
317 }
318
319 # do the sortmeta thing
320 my ($metadata);
321 if (defined ($self->{'sortmeta'})) {
322 $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
323 $self->{'sortmeta'});
324 }
325 if (defined ($metadata) && $metadata) {
326 # do remove prefix/suffix
327 if (defined($self->{'removeprefix'})) {
328 $metadata =~ s/^$self->{'removeprefix'}//;
329 }
330 if (defined($self->{'removesuffix'})) {
331 $metadata =~ s/$self->{'removesuffix'}$//;
332 }
333 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
334 }
335 # store reference in the archive_info and export_info
336 if ($service eq "export") {
337 $self->{'export_info'}->add_info($OID, $short_doc_file, $metadata);
338 } elsif ($service eq "import") {
339 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
340 }
341}
342
343
344sub group_process {
345 my $self = shift (@_);
346 my ($doc_obj) = @_;
347
348 my $outhandle = $self->{'outhandle'};
349
350 my $OID = $doc_obj->get_OID();
351 $OID = "NULL" unless defined $OID;
352
353 my $groupsize = $self->{'groupsize'};
354 my $gs_count = $self->{'gs_count'};
355 my $open_new_file = (($gs_count % $groupsize)==0);
356
357 # opening a new file, or document has assoicated files => directory needed
358 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
359
360 # get document's directory
361 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
362
363 # copy all the associated files, add this information as metadata
364 # to the document
365 $self->process_assoc_files ($doc_obj, $doc_dir);
366
367
368 if ($open_new_file) {
369 # only if opening new file
370 my $doc_file
371 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
372 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
373
374 if ($gs_count>0)
375 {
376 return if (!$self->close_file_output());
377 }
378
379 if (!open (OUTDOC, ">$doc_file")) {
380 print $outhandle "docsave::group_process could not write to file $doc_file\n";
381 return;
382 }
383 $self->{'gs_filename'} = $doc_file;
384 $self->{'gs_short_filename'} = $short_doc_file;
385 $self->{'gs_OID'} = $OID;
386
387 $self->output_xml_header('docsave::OUTDOC');
388 }
389 }
390
391 # save this document
392 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
393
394 $self->{'gs_count'}++;
395}
396
397sub get_doc_dir {
398 my $self = shift (@_);
399 my ($OID, $source_filename) = @_;
400 my $doc_info;
401 my $doc_dir = '';
402 my $service = $self-> {'service'};
403 my $working_dir;
404 my $working_info;
405
406 if ($service eq "import") {
407 $doc_info = $self->{'archive_info'}->get_info($OID);
408 $working_dir = $self->{'archive_dir'};
409 $working_info = $self->{'archive_info'};
410 } elsif ($service eq "export") {
411 $doc_info =$self->{'export_info'}->get_info($OID);
412 $working_dir = $self->{'export_dir'};
413 $working_info = $self->{'export_info'};
414 } else {
415 return;
416 }
417 if (defined $doc_info && scalar(@$doc_info) >= 1) {
418 # this OID already has an assigned directory, use the
419 # same one.
420 $doc_dir = $doc_info->[0];
421 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
422 } elsif ($self->{'keepimportstructure'}) {
423 $source_filename = &File::Basename::dirname($source_filename);
424 $source_filename =~ s/[\\\/]+/\//g;
425 $source_filename =~ s/\/$//;
426
427
428 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
429 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
430
431 }
432 if ($doc_dir eq "") {
433 # have to get a new document directory
434 my $doc_dir_rest = $OID;
435 my $doc_dir_num = 0;
436 do {
437 $doc_dir .= "/" if $doc_dir_num > 0;
438 if ($doc_dir_rest =~ s/^(.{1,8})//) {
439 $doc_dir .= $1;
440 $doc_dir_num++;
441 }
442 } while ($doc_dir_rest ne "" &&
443 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
444 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
445
446 $doc_dir .= ".dir";
447 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
448 }
449 return $doc_dir;
450}
451
452sub process_assoc_files {
453 my $self = shift (@_);
454 my ($doc_obj, $doc_dir, $handle) = @_;
455
456 my $outhandle = $self->{'outhandle'};
457
458 my @assoc_files = ();
459 my $filename;;
460 my $working_dir;
461 my $service = $self->{'service'};
462 my $save_as = $self->{'saveas'};
463
464 if ($service eq "import") {
465 $working_dir = $self->{'archive_dir'};
466 } elsif ($service eq "export"){
467 $working_dir = $self->{'export_dir'};
468 } else {
469 return;
470 }
471 $doc_obj->get_source_filename()=~ /\/[^\/\\]$/;
472
473 if ($save_as eq "DSpace") {
474 print $handle "$1\n";
475 $filename = &util::filename_cat($working_dir, $doc_dir, $1);
476 &util::hard_link ($doc_obj->get_source_filename(), $filename);
477 }
478
479 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
480 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
481 $dir = "" unless defined $dir;
482
483 # Store the associated file to the "contents" file
484 if ($save_as eq "DSpace") {
485 print $handle "$assoc_file->[1]\n";
486 }
487
488 if (-e $assoc_file->[0]) {
489 $filename = &util::filename_cat($working_dir, $doc_dir, $afile);
490
491 &util::hard_link ($assoc_file->[0], $filename);
492
493 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
494 "gsdlassocfile",
495 "$afile:$assoc_file->[2]:$dir");
496 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
497 "assocfilepath",
498 "$doc_dir");
499 } elsif ($self->{'verbosity'} > 2) {
500 print $outhandle "docsave::process couldn't copy the associated file " .
501 "$assoc_file->[0] to $afile\n";
502 }
503 }
504}
505
506
507sub close_file_output
508{
509 my ($self) = @_;
510 my $service =$self->{'service'};
511
512 # make sure that the handle has been opened - it won't be if we failed
513 # to import any documents...
514 if (defined(fileno(docsave::OUTDOC))) {
515 $self->output_xml_footer('docsave::OUTDOC');
516 close OUTDOC;
517 }
518
519 my $OID = $self->{'gs_OID'};
520 my $short_doc_file;
521 # can we use 'short_doc_file' for GA too?
522 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
523 $short_doc_file=$self->{'short_doc_file'};
524 } elsif ($self->{'saveas'} eq "GA") { # "GA"
525 $short_doc_file=$self->{'gs_short_filename'};
526 } else { # "DSpace"
527 }
528
529 if ($self->{'gzip'}) {
530 my $doc_file = $self->{'gs_filename'};
531 `gzip $doc_file`;
532 $doc_file .= ".gz";
533 $short_doc_file .= ".gz";
534 if (!-e $doc_file) {
535 my $outhandle = $self->{'outhandle'};
536 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
537 return 0;
538 }
539 }
540
541 # store reference in the archive_info and export_infor
542 if ($service eq "import") {
543 $self->{'archive_info'}->add_info($OID, $short_doc_file);
544 } elsif ($service eq "export") {
545 $self->{'export_info'}->add_info($OID, $short_doc_file);
546 } else {
547 return;
548 }
549 return 1;
550}
551
552sub output_xml_header {
553 my $self = shift (@_);
554 my ($handle) = @_;
555
556 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
557
558 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
559 print $handle "<Archive>\n";
560}
561
562sub output_xml_footer {
563 my $self = shift (@_);
564 my ($handle) = @_;
565
566 print $handle "</Archive>\n";
567}
568
569sub output_txt_xml_header{
570 my $self = shift (@_);
571 my ($handle) = @_;
572 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
573 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
574}
575
576sub output_txt_xml_footer{
577 my $self = shift(@_);
578 my ($handle) = @_;
579 print $handle "<the end of the file>\n";
580}
581
582sub output_mets_xml_header(){
583 my $self = shift(@_);
584 my ($handle, $OID) = @_;
585
586 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
587 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
588 print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
589}
590
591sub output_mets_xml_footer() {
592 my $self = shift(@_);
593 my ($handle) = @_;
594 print $handle '</mets:mets>' . "\n";
595}
596
597sub output_dc_xml_header(){
598 my $self = shift(@_);
599 my ($handle, $OID) = @_;
600
601 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
602# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
603 print $handle '<dublin_core>' . "\n";
604}
605
606sub output_dc_xml_footer() {
607 my $self = shift(@_);
608 my ($handle) = @_;
609 print $handle '</dublin_core>' . "\n";
610}
6111;
Note: See TracBrowser for help on using the repository browser.