source: trunk/gsdl/perllib/docsave.pm@ 8794

Last change on this file since 8794 was 8716, checked in by kjdon, 20 years ago

added some changes made by Emanuel Dejanu (Simple Words)

  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use expinfo;
38use docproc;
39use util;
40
41
42sub BEGIN {
43 @docsave::ISA = ('docproc');
44}
45
46sub new {
47 my ($class, $collection, $info, $verbosity,
48 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
49 my $self = new docproc ();
50
51 $groupsize=1 unless defined $groupsize;
52 $self->{'collection'} = $collection;
53 if ($service eq "import"){
54 $self->{'archive_info'} = $info;
55 } elsif ($service eq "export"){
56 $self->{'export_info'} = $info;
57 } else {
58 return;
59 }
60
61 $self->{'verbosity'} = $verbosity;
62 $self->{'gzip'} = $gzip;
63 $self->{'keepimportstructure'} = 0;
64 $self->{'groupsize'} = $groupsize;
65 $self->{'gs_count'} = 0;
66
67 $self->{'outhandle'} = 'STDERR';
68 $self->{'outhandle'} = $outhandle if defined $outhandle;
69 $self->{'service'} = $service;
70 $self->{'saveas'} = $saveas;
71
72 # set a default for the archive directory
73 if ($service eq "import"){
74 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
75 } elsif ($service eq "export") {
76 # set a default for the export directory
77 $self->{'export_dir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");
78 } else {
79 return;
80 }
81 $self->{'sortmeta'} = undef;
82
83 return bless $self, $class;
84}
85
86sub setarchivedir {
87 my $self = shift (@_);
88 my ($archive_dir) = @_;
89
90 &util::mk_all_dir ($archive_dir) unless -e $archive_dir;
91 $self->{'archive_dir'} = $archive_dir;
92}
93
94sub setexportdir {
95 my $self = shift (@_);
96 my ($export_dir) = @_;
97
98 &util::mk_all_dir ($export_dir) unless -e $export_dir;
99 $self->{'export_dir'} = $export_dir;
100}
101
102sub set_sortmeta {
103 my $self = shift (@_);
104 my ($sortmeta) = @_;
105
106 $self->{'sortmeta'} = $sortmeta;
107 }
108
109sub process {
110 my $self = shift (@_);
111 my ($doc_obj) = @_;
112
113 my $outhandle = $self->{'outhandle'};
114 my $service = $self->{'service'} || "import";
115
116 # Define the SaveAs Type
117 my $save_as = $self->{'saveas'} || "GA";
118 my $collection = $self->{'collection'};
119
120 if ($self->{'groupsize'} > 1) {
121 $self->group_process ($doc_obj);
122 return;
123 }
124
125 my $OID = $doc_obj->get_OID();
126 $OID = "NULL" unless defined $OID;
127
128 # get document's directory
129 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
130
131 # groupsize is 1 (i.e. one document per XML file) so sortmeta
132 # may be used
133
134 if ($service eq "import") {
135 my $archive_info = $self->{'archive_info'};
136 } elsif ($service eq "export") {
137 my $export_info = $self->{'export_info'};
138 } else {
139 return;
140 }
141
142 # copy all the associated files, add this information as metadata
143 # to the document
144 if ($service eq "export" && $save_as eq "DSpace") {
145 # open contents file
146 my $doc_contents_file
147 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "contents");
148
149 if (!open(OUTDOC_EXPORT_CONTENTS,">$doc_contents_file")){
150 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
151 return;
152 }
153 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_EXPORT_CONTENTS');
154 } else {
155 $self->process_assoc_files ($doc_obj, $doc_dir, '');
156 }
157
158 my $doc_file;
159 my $doc_mets_file;
160 my $doc_txt_file;
161 my $short_doc_file;
162
163 #Import collection to GS2 in GS Archive format and METs format
164 if ($service eq "import") {
165 my $doc_file
166 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
167
168 #***define doctxt.xml file
169 my $doc_txt_file
170 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
171
172 my $import_working_dir
173 =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
174
175 #***define docmets.xml file
176 my $doc_mets_file
177 = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
178
179 if ($save_as eq "GA") {
180 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
181 } elsif ($save_as eq "METS") {
182 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
183 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
184 } else {
185 return;
186 }
187
188 if ($save_as eq "GA") {
189 if (!open (OUTDOC, ">$doc_file")) {
190 print $outhandle "docsave::process could not write to file $doc_file\n";
191 return;
192 }
193 # save this document
194 $self->output_xml_header('docsave::OUTDOC');
195 $doc_obj->output_section('docsave::OUTDOC',
196 $doc_obj->get_top_section());
197 $self->output_xml_footer('docsave::OUTDOC');
198
199 close OUTDOC;
200 } elsif ($save_as eq "METS") {
201 # save the document without metadata:doctxt.xml
202
203 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
204 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
205 return;
206 }
207
208 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
209 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
210 #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
211
212 # Convert doctxt.xml file to docmets.xml
213 if (!open(OUTDOC_METS,">$doc_mets_file")){
214 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
215 return;
216 }
217
218 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
219 $doc_obj->output_mets_section('docsave::OUTDOC_METS',
220 $doc_obj->get_top_section(),
221 $import_working_dir);
222 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
223
224 close OUTDOC_TXT;
225 close OUTDOC_METS;
226 } else { # save_as isn't GA or METS
227 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
228 return;
229 }
230 }
231
232 ## Export the collection to METs format or DSpace Archive Format into the export directory
233 if ($service eq "export") {
234 my $doc_dc_file;
235 my $doc_contents_file;
236
237 my $export_working_dir
238 =&util::filename_cat ($self->{'export_dir'}, $doc_dir);
239
240 if ($save_as eq "METS") {
241 $doc_mets_file
242 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "docmets.xml");
243
244 $doc_txt_file
245 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "doctxt.xml");
246
247 if (!open(OUTDOC_EXPORT_TXT, ">$doc_txt_file")){
248 print $outhandle "docsave::process could not write TXT to file $doc_txt_file\n";
249 return;
250 }
251
252 $self->output_txt_xml_header('docsave::OUTDOC_EXPORT_TXT');
253 $doc_obj->output_txt_section('docsave::OUTDOC_EXPORT_TXT', $doc_obj->get_top_section());
254
255 if (!open(OUTDOC_EXPORT_METS,">$doc_mets_file")){
256 print $outhandle "docsave::process could not write METS format to file $doc_mets_file\n";
257 return;
258 }
259 $self->output_mets_xml_header('docsave::OUTDOC_EXPORT_METS', $OID);
260 $doc_obj->output_mets_section('docsave::OUTDOC_EXPORT_METS',$doc_obj->get_top_section(), $export_working_dir);
261 $self->output_mets_xml_footer('docsave::OUTDOC_EXPORT_METS');
262
263 close OUTDOC_EXPORT_TXT;
264 close OUTDOC_EXPORT_METS;
265 } elsif ($save_as eq "DSpace") {
266
267 # Generate dublin_core.xml file
268 $doc_dc_file
269 = &util::filename_cat ($self->{'export_dir'},$doc_dir, "dublin_core.xml");
270
271 if (!open(OUTDOC_EXPORT_DC,">$doc_dc_file")){
272 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
273 return;
274 }
275
276 $self->output_dc_xml_header('docsave::OUTDOC_EXPORT_DC', $OID);
277 $doc_obj->output_dc_section('docsave::OUTDOC_EXPORT_DC',$doc_obj->get_top_section(), $export_working_dir);
278 $self->output_dc_xml_footer('docsave::OUTDOC_EXPORT_DC');
279
280 close OUTDOC_EXPORT_DC;
281 close OUTDOC_EXPORT_CONTENTS;
282 } else { # save_as isn't METS or DSpace
283 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
284 return;
285 }
286
287 if ($save_as eq "METS") {
288 $short_doc_file = util::filename_cat ($doc_dir, "docmets.xml");
289 } elsif ($save_as eq "DSpace") {
290 #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
291 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
292 } else {
293 return;
294 }
295
296 }
297 #save for later (for close_file_output())
298 $self->{'short_doc_file'} = $short_doc_file;
299
300 if ($self->{'gzip'}) {
301 my $doc_file = $self->{'gs_filename'};
302 `gzip $doc_file`;
303 $doc_file .= ".gz";
304 $short_doc_file .= ".gz";
305 if (!-e $doc_file) {
306 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
307 return 0;
308 }
309 }
310
311 # do the sortmeta thing
312 my ($metadata);
313 if (defined ($self->{'sortmeta'})) {
314 $metadata = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
315 $self->{'sortmeta'});
316 }
317
318 # store reference in the archive_info and export_info
319 if ($service eq "export") {
320 $self->{'export_info'}->add_info($OID, $short_doc_file, $metadata);
321 } elsif ($service eq "import") {
322 $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
323 }
324}
325
326
327sub group_process {
328 my $self = shift (@_);
329 my ($doc_obj) = @_;
330
331 my $outhandle = $self->{'outhandle'};
332
333 my $OID = $doc_obj->get_OID();
334 $OID = "NULL" unless defined $OID;
335
336 my $groupsize = $self->{'groupsize'};
337 my $gs_count = $self->{'gs_count'};
338 my $open_new_file = (($gs_count % $groupsize)==0);
339
340 # opening a new file, or document has assoicated files => directory needed
341 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
342
343 # get document's directory
344 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
345
346 # copy all the associated files, add this information as metadata
347 # to the document
348 $self->process_assoc_files ($doc_obj, $doc_dir);
349
350
351 if ($open_new_file) {
352 # only if opening new file
353 my $doc_file
354 = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
355 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
356
357 if ($gs_count>0)
358 {
359 return if (!$self->close_file_output());
360 }
361
362 if (!open (OUTDOC, ">$doc_file")) {
363 print $outhandle "docsave::group_process could not write to file $doc_file\n";
364 return;
365 }
366 $self->{'gs_filename'} = $doc_file;
367 $self->{'gs_short_filename'} = $short_doc_file;
368 $self->{'gs_OID'} = $OID;
369
370 $self->output_xml_header('docsave::OUTDOC');
371 }
372 }
373
374 # save this document
375 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
376
377 $self->{'gs_count'}++;
378}
379
380sub get_doc_dir {
381 my $self = shift (@_);
382 my ($OID, $source_filename) = @_;
383 my $doc_info;
384 my $doc_dir = '';
385 my $service = $self-> {'service'};
386 my $working_dir;
387 my $working_info;
388
389 if ($service eq "import") {
390 $doc_info = $self->{'archive_info'}->get_info($OID);
391 $working_dir = $self->{'archive_dir'};
392 $working_info = $self->{'archive_info'};
393 } elsif ($service eq "export") {
394 $doc_info =$self->{'export_info'}->get_info($OID);
395 $working_dir = $self->{'export_dir'};
396 $working_info = $self->{'export_info'};
397 } else {
398 return;
399 }
400 if (defined $doc_info && scalar(@$doc_info) >= 1) {
401 # this OID already has an assigned directory, use the
402 # same one.
403 $doc_dir = $doc_info->[0];
404 $doc_dir =~ s/\/?doc\.xml(\.gz)?$//;
405 } elsif ($self->{'keepimportstructure'}) {
406 $source_filename = &File::Basename::dirname($source_filename);
407 $source_filename =~ s/[\\\/]+/\//g;
408 $source_filename =~ s/\/$//;
409
410
411 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
412 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
413
414 }
415 if ($doc_dir eq "") {
416 # have to get a new document directory
417 my $doc_dir_rest = $OID;
418 my $doc_dir_num = 0;
419 do {
420 $doc_dir .= "/" if $doc_dir_num > 0;
421 if ($doc_dir_rest =~ s/^(.{1,8})//) {
422 $doc_dir .= $1;
423 $doc_dir_num++;
424 }
425 } while ($doc_dir_rest ne "" &&
426 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
427 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
428
429 $doc_dir .= ".dir";
430 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
431 }
432 return $doc_dir;
433}
434
435sub process_assoc_files {
436 my $self = shift (@_);
437 my ($doc_obj, $doc_dir, $handle) = @_;
438
439 my $outhandle = $self->{'outhandle'};
440
441 my @assoc_files = ();
442 my $filename;;
443 my $working_dir;
444 my $service = $self->{'service'};
445 my $save_as = $self->{'saveas'};
446
447 if ($service eq "import") {
448 $working_dir = $self->{'archive_dir'};
449 } elsif ($service eq "export"){
450 $working_dir = $self->{'export_dir'};
451 } else {
452 return;
453 }
454 $doc_obj->get_source_filename()=~ /\/[^\/\\]$/;
455
456 if ($save_as eq "DSpace") {
457 print $handle "$1\n";
458 $filename = &util::filename_cat($working_dir, $doc_dir, $1);
459 &util::hard_link ($doc_obj->get_source_filename(), $filename);
460 }
461
462 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
463 my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/;
464 $dir = "" unless defined $dir;
465
466 # Store the associated file to the "contents" file
467 if ($save_as eq "DSpace") {
468 print $handle "$assoc_file->[1]\n";
469 }
470
471 if (-e $assoc_file->[0]) {
472 $filename = &util::filename_cat($working_dir, $doc_dir, $afile);
473
474 &util::hard_link ($assoc_file->[0], $filename);
475
476 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
477 "gsdlassocfile",
478 "$afile:$assoc_file->[2]:$dir");
479 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
480 "assocfilepath",
481 "$doc_dir");
482 } elsif ($self->{'verbosity'} > 2) {
483 print $outhandle "docsave::process couldn't copy the associated file " .
484 "$assoc_file->[0] to $afile\n";
485 }
486 }
487}
488
489
490sub close_file_output
491{
492 my ($self) = @_;
493 my $service =$self->{'service'};
494
495 # make sure that the handle has been opened - it won't be if we failed
496 # to import any documents...
497 if (defined(fileno(docsave::OUTDOC))) {
498 $self->output_xml_footer('docsave::OUTDOC');
499 close OUTDOC;
500 }
501
502 my $OID = $self->{'gs_OID'};
503 my $short_doc_file;
504 # can we use 'short_doc_file' for GA too?
505 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
506 $short_doc_file=$self->{'short_doc_file'};
507 } elsif ($self->{'saveas'} eq "GA") { # "GA"
508 $short_doc_file=$self->{'gs_short_filename'};
509 } else { # "DSpace"
510 }
511
512 if ($self->{'gzip'}) {
513 my $doc_file = $self->{'gs_filename'};
514 `gzip $doc_file`;
515 $doc_file .= ".gz";
516 $short_doc_file .= ".gz";
517 if (!-e $doc_file) {
518 my $outhandle = $self->{'outhandle'};
519 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
520 return 0;
521 }
522 }
523
524 # store reference in the archive_info and export_infor
525 if ($service eq "import") {
526 $self->{'archive_info'}->add_info($OID, $short_doc_file);
527 } elsif ($service eq "export") {
528 $self->{'export_info'}->add_info($OID, $short_doc_file);
529 } else {
530 return;
531 }
532 return 1;
533}
534
535sub output_xml_header {
536 my $self = shift (@_);
537 my ($handle) = @_;
538
539 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
540
541 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
542 print $handle "<Archive>\n";
543}
544
545sub output_xml_footer {
546 my $self = shift (@_);
547 my ($handle) = @_;
548
549 print $handle "</Archive>\n";
550}
551
552sub output_txt_xml_header{
553 my $self = shift (@_);
554 my ($handle) = @_;
555 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
556 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
557}
558
559sub output_txt_xml_footer{
560 my $self = shift(@_);
561 my ($handle) = @_;
562 print $handle "<the end of the file>\n";
563}
564
565sub output_mets_xml_header(){
566 my $self = shift(@_);
567 my ($handle, $OID) = @_;
568
569 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
570 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
571 print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
572}
573
574sub output_mets_xml_footer() {
575 my $self = shift(@_);
576 my ($handle) = @_;
577 print $handle '</mets:mets>' . "\n";
578}
579
580sub output_dc_xml_header(){
581 my $self = shift(@_);
582 my ($handle, $OID) = @_;
583
584 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
585# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
586 print $handle '<dublin_core>' . "\n";
587}
588
589sub output_dc_xml_footer() {
590 my $self = shift(@_);
591 my ($handle) = @_;
592 print $handle '</dublin_core>' . "\n";
593}
5941;
Note: See TracBrowser for help on using the repository browser.