source: trunk/gsdl/perllib/docsave.pm@ 10465

Last change on this file since 10465 was 10431, checked in by chi, 19 years ago

In process_assoc_files, for some reasons, there is a "/" in front of full path name of assoicated
image file.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use docproc;
38use util;
39
40
41sub BEGIN {
42 @docsave::ISA = ('docproc');
43}
44
45
46sub calcoutputdir
47{
48 my ($service,$collectdir,$outhandle) = @_;
49
50 my $output_dir = undef;
51
52 if (($service eq "import") || ($service eq "unbuild")) {
53 $output_dir = &util::filename_cat ($collectdir, "archives");
54 }
55 elsif ($service eq "export") {
56 $output_dir = &util::filename_cat($collectdir, "export");
57 }
58 else {
59 print $outhandle "docsave::calcoutputdir did not recognise service ";
60 print $outhandle " '$service'. No output directory set.\n";
61 }
62
63 return $output_dir;
64}
65
66
67sub new {
68 my ($class, $collection, $info, $verbosity,
69 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
70 my $self = new docproc ();
71
72 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
73
74 $outhandle = 'STDERR' unless (defined $outhandle);
75 $service = "import" unless (defined $service);
76 $saveas = "GA" unless (defined $saveas);
77 $groupsize = 1 unless (defined $groupsize);
78
79 $self->{'collection'} = $collection;
80
81 $self->{'output_info'} = $info;
82 $self->{'output_dir'} = undef;
83
84 $self->{'verbosity'} = $verbosity;
85 $self->{'gzip'} = $gzip;
86 $self->{'keepimportstructure'} = 0;
87 $self->{'groupsize'} = $groupsize;
88 $self->{'gs_count'} = 0;
89
90 $self->{'outhandle'} = $outhandle;
91 $self->{'service'} = $service;
92 $self->{'saveas'} = $saveas;
93
94 $self->{'sortmeta'} = undef;
95
96 return bless $self, $class;
97}
98
99
100
101sub setoutputdir
102{
103 my $self = shift @_;
104 my ($output_dir) = @_;
105
106 &util::mk_all_dir ($output_dir) unless -e $output_dir;
107
108 $self->{'output_dir'} = $output_dir;
109}
110
111sub getoutputinfo
112{
113 my $self = shift (@_);
114
115 return $self->{'output_info'};
116}
117
118sub getoutputdir
119{
120 my $self = shift (@_);
121
122 return $self->{'output_dir'};
123}
124
125
126sub set_sortmeta {
127 my $self = shift (@_);
128 my ($sortmeta, $removeprefix, $removesuffix) = @_;
129
130 $self->{'sortmeta'} = $sortmeta;
131 if (defined ($removeprefix) && $removeprefix ) {
132 $removeprefix =~ s/^\^//; # don't need a leading ^
133 $self->{'removeprefix'} = $removeprefix;
134 }
135 if (defined ($removesuffix) && $removesuffix) {
136 $removesuffix =~ s/\$$//; # don't need a trailing $
137 $self->{'removesuffix'} = $removesuffix;
138 }
139}
140
141sub process {
142 my $self = shift (@_);
143 my ($doc_obj) = @_;
144
145 my $outhandle = $self->{'outhandle'};
146 my $service = $self->{'service'};
147
148 # Define the SaveAs Type
149 my $save_as = $self->{'saveas'};
150 my $collection = $self->{'collection'};
151
152 # set the lastmodified time before we write it out.
153 $doc_obj->set_lastmodified();
154
155 if ($self->{'groupsize'} > 1) {
156 $self->group_process ($doc_obj);
157 return;
158 }
159
160 my $OID = $doc_obj->get_OID();
161 $OID = "NULL" unless defined $OID;
162
163 my $top_section = $doc_obj->get_top_section();
164
165 # get document's directory
166 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
167
168 # groupsize is 1 (i.e. one document per XML file) so sortmeta
169 # may be used
170
171 my $output_info = $self->getoutputinfo();
172 return if (!defined $output_info);
173
174 my $output_dir = $self->getoutputdir();
175 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
176
177 # copy all the associated files, add this information as metadata
178 # to the document
179 if ($save_as eq "DSpace") {
180
181 # Genereate handle file
182 # (Note: this section of code would benefit from being restructured)
183 my $doc_handle_file = &util::filename_cat ($working_dir, "handle");
184
185 my $env_hp = $ENV{'DSPACE_HANDLE_PREFIX'};
186 my $handle_prefix = (defined $env_hp) ? $env_hp : "123456789";
187
188 if (!open(OUTDOC_HANDLE,">$doc_handle_file")){
189 print $outhandle "docsave::process could not write collection handle to file $doc_handle_file\n";
190 return;
191 }
192
193 my ($handle) = ($doc_dir =~ m/^(.*)\.dir$/);
194 print OUTDOC_HANDLE "$handle_prefix/$handle\n";
195
196 close OUTDOC_HANDLE;
197
198 # Generate contents file
199 my $doc_contents_file = &util::filename_cat ($working_dir, "contents");
200
201 if (!open(OUTDOC_CONTENTS,">$doc_contents_file")){
202 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
203 return;
204 }
205 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_CONTENTS');
206
207 close OUTDOC_CONTENTS;
208
209 } else {
210 $self->process_assoc_files ($doc_obj, $doc_dir, '');
211 }
212
213 # Save the document in the requested 'save_as' format
214
215 if ($save_as eq "GA") {
216
217 my $doc_file = &util::filename_cat ($working_dir, "doc.xml");
218
219 if (!open (OUTDOC, ">$doc_file")) {
220 print $outhandle "docsave::process could not write to file $doc_file\n";
221 return;
222 }
223
224 # save this document
225 $self->output_xml_header('docsave::OUTDOC');
226 $doc_obj->output_section('docsave::OUTDOC',$top_section);
227 $self->output_xml_footer('docsave::OUTDOC');
228
229 close OUTDOC;
230 }
231 elsif ($save_as eq "METS") {
232
233 my $doc_txt_file = &util::filename_cat ($working_dir,"doctxt.xml");
234
235 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
236 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
237 return;
238 }
239
240 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
241 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $top_section);
242 $self->output_txt_xml_footer('docsave::OUTDOC_TXT');
243
244 close OUTDOC_TXT;
245
246 # Now save the document with metadata and text structure to docmets.xml
247
248 my $doc_mets_file = &util::filename_cat ($working_dir, "docmets.xml");
249
250 my $doc_title = $doc_obj->get_metadata_element($top_section,"dc.Title");
251 if (!defined $doc_title) {
252 $doc_title = $doc_obj->get_metadata_element($top_section,"Title");
253 }
254
255 if (!open(OUTDOC_METS,">$doc_mets_file")){
256 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
257 return;
258 }
259
260 my $saveas_version = $self->{'saveas_version'};
261 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID, $doc_title);
262 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$top_section,$saveas_version,$working_dir);
263 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
264
265 close OUTDOC_METS;
266 }
267 elsif ($save_as eq "DSpace") {
268
269 # Generate dublin_core.xml file
270 my $doc_dc_file = &util::filename_cat ($working_dir, "dublin_core.xml");
271
272 if (!open(OUTDOC_DC,">$doc_dc_file")){
273 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
274 return;
275 }
276
277 my $saveas_version = $self->{'saveas_version'};
278
279 $self->output_dc_xml_header('docsave::OUTDOC_DC', $OID);
280 $doc_obj->output_dc_section('docsave::OUTDOC_DC',$top_section);
281 $self->output_dc_xml_footer('docsave::OUTDOC_DC');
282
283 close OUTDOC_DC;
284 } else { # save_as isn't one of the recognised types
285 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
286 return;
287 }
288
289
290 my $short_doc_file;
291
292 if ($save_as eq "GA") {
293 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
294 } elsif ($save_as eq "METS") {
295 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
296 } elsif ($save_as eq "DSpace") {
297 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
298 } else {
299 return;
300 }
301
302 #save for later (for close_file_output())
303 $self->{'short_doc_file'} = $short_doc_file;
304
305 if ($self->{'gzip'}) {
306 my $doc_file = $self->{'gs_filename'};
307 `gzip $doc_file`;
308 $doc_file .= ".gz";
309 $short_doc_file .= ".gz";
310 if (!-e $doc_file) {
311 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
312 return 0;
313 }
314 }
315
316 # do the sortmeta thing
317 my ($metadata);
318 if (defined ($self->{'sortmeta'})) {
319 $metadata = $doc_obj->get_metadata_element($top_section,$self->{'sortmeta'});
320 }
321 if (defined ($metadata) && $metadata) {
322 # do remove prefix/suffix
323 if (defined($self->{'removeprefix'})) {
324 $metadata =~ s/^$self->{'removeprefix'}//;
325 }
326 if (defined($self->{'removesuffix'})) {
327 $metadata =~ s/$self->{'removesuffix'}$//;
328 }
329 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
330 }
331
332 # store reference in the output_info
333 $output_info->add_info($OID, $short_doc_file, undef, $metadata);
334}
335
336
337sub group_process {
338 my $self = shift (@_);
339 my ($doc_obj) = @_;
340
341 my $outhandle = $self->{'outhandle'};
342
343 my $OID = $doc_obj->get_OID();
344 $OID = "NULL" unless defined $OID;
345
346 my $groupsize = $self->{'groupsize'};
347 my $gs_count = $self->{'gs_count'};
348 my $open_new_file = (($gs_count % $groupsize)==0);
349
350 # opening a new file, or document has assoicated files => directory needed
351 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
352
353 # get document's directory
354 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
355
356 # copy all the associated files, add this information as metadata
357 # to the document
358 $self->process_assoc_files ($doc_obj, $doc_dir);
359
360
361 if ($open_new_file) {
362 # only if opening new file
363 my $output_dir = $self->getoutputdir();
364 my $doc_file
365 = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
366 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
367
368 if ($gs_count>0)
369 {
370 return if (!$self->close_file_output());
371 }
372
373 if (!open (OUTDOC, ">$doc_file")) {
374 print $outhandle "docsave::group_process could not write to file $doc_file\n";
375 return;
376 }
377 $self->{'gs_filename'} = $doc_file;
378 $self->{'gs_short_filename'} = $short_doc_file;
379 $self->{'gs_OID'} = $OID;
380
381 $self->output_xml_header('docsave::OUTDOC');
382 }
383 }
384
385 # save this document
386 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
387
388 $self->{'gs_count'}++;
389}
390
391sub get_doc_dir {
392 my $self = shift (@_);
393 my ($OID, $source_filename) = @_;
394
395 my $service = $self-> {'service'};
396
397 my $working_dir = $self->getoutputdir();
398 my $working_info = $self->getoutputinfo();
399 return if (!defined $working_info);
400
401 my $doc_info = $working_info->get_info($OID);
402 my $doc_dir = '';
403
404 if (defined $doc_info && scalar(@$doc_info) >= 1) {
405 # this OID already has an assigned directory, use the
406 # same one.
407 $doc_dir = $doc_info->[0];
408 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
409 } elsif ($self->{'keepimportstructure'}) {
410 $source_filename = &File::Basename::dirname($source_filename);
411 $source_filename =~ s/[\\\/]+/\//g;
412 $source_filename =~ s/\/$//;
413
414
415 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
416 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
417
418 }
419 if ($doc_dir eq "") {
420 # have to get a new document directory
421
422 if (($service eq "import") || ($service eq "unbuild")) {
423 my $doc_dir_rest = $OID;
424 my $doc_dir_num = 0;
425
426 do {
427 $doc_dir .= "/" if $doc_dir_num > 0;
428 if ($doc_dir_rest =~ s/^(.{1,8})//) {
429 $doc_dir .= $1;
430 $doc_dir_num++;
431 }
432 } while ($doc_dir_rest ne "" &&
433 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
434 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
435 }
436 else {
437 # Export formats such as DSpace need the directory structure to
438 # be flat. This is simple to arrange (set 'doc_dir' to be the
439 # documents OID) but breaks Windows 3.1 file system compliance.
440 # Such a loss is not a big thing in this situation as such
441 # systems don't run on Windows 3.1 anyway.
442
443 $doc_dir = $OID;
444 }
445
446
447 $doc_dir .= ".dir";
448 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
449 }
450 return $doc_dir;
451}
452
453sub process_assoc_files {
454 my $self = shift (@_);
455 my ($doc_obj, $doc_dir, $handle) = @_;
456
457 my $outhandle = $self->{'outhandle'};
458 my $service = $self->{'service'};
459 my $save_as = $self->{'saveas'};
460
461 my $output_dir = $self->getoutputdir();
462 return if (!defined $output_dir);
463
464 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
465
466 my @assoc_files = ();
467 my $filename;;
468
469 my $source_filename = $doc_obj->get_source_filename();
470
471 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
472
473 if (defined $collect_dir) {
474 my $dirsep_regexp = &util::get_os_dirsep();
475
476 if ($collect_dir !~ /$dirsep_regexp$/) {
477 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
478 }
479
480 # This test is never going to fail on Windows -- is this a problem?
481 if ($source_filename !~ /^$dirsep_regexp/) {
482 $source_filename = &util::filename_cat($collect_dir, $source_filename);
483 }
484 }
485
486
487 if ($save_as eq "DSpace") {
488 my ($tail_filename) = ($source_filename =~ m/\/([^\/\\]*)$/);
489
490 print $handle "$tail_filename\n";
491
492 $filename = &util::filename_cat($working_dir, $tail_filename);
493 &util::hard_link ($source_filename, $filename);
494 }
495
496 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
497 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
498 $dir = "" unless defined $dir;
499
500
501 my $real_filename = $assoc_file_rec->[0];
502 # for some reasons the image associate file has / before the full path
503 $real_filename =~ s/^\\(.*)/$1/i;
504 if (-e $real_filename) {
505
506
507 if ($save_as eq "DSpace") {
508 if ($real_filename =~ m/$source_filename$/) {
509 next;
510 }
511 else {
512 my $bundle = "bundle:ORIGINAL";
513
514 if ($afile =~ m/^thumbnail\./) {
515 $bundle = "bundle:THUMBNAIL";
516 }
517
518 # Store the associated file to the "contents" file
519 print $handle "$assoc_file_rec->[1]\t$bundle\n";
520 }
521 }
522
523 $filename = &util::filename_cat($working_dir, $afile);
524
525
526 &util::hard_link ($real_filename, $filename);
527
528 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
529 "gsdlassocfile",
530 "$afile:$assoc_file_rec->[2]:$dir");
531 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
532 "assocfilepath",
533 "$doc_dir");
534 } elsif ($self->{'verbosity'} > 2) {
535 print $outhandle "docsave::process couldn't copy the associated file " .
536 "$real_filename to $afile\n";
537 }
538 }
539}
540
541
542sub close_file_output
543{
544 my ($self) = @_;
545 my $service =$self->{'service'};
546
547 # make sure that the handle has been opened - it won't be if we failed
548 # to import any documents...
549 if (defined(fileno(docsave::OUTDOC))) {
550 $self->output_xml_footer('docsave::OUTDOC');
551 close OUTDOC;
552 }
553
554 my $OID = $self->{'gs_OID'};
555 my $short_doc_file;
556 # can we use 'short_doc_file' for GA too?
557 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
558 $short_doc_file=$self->{'short_doc_file'};
559 } elsif ($self->{'saveas'} eq "GA") { # "GA"
560 $short_doc_file=$self->{'gs_short_filename'};
561 } else { # "DSpace"
562 }
563
564 if ($self->{'gzip'}) {
565 my $doc_file = $self->{'gs_filename'};
566 `gzip $doc_file`;
567 $doc_file .= ".gz";
568 $short_doc_file .= ".gz";
569 if (!-e $doc_file) {
570 my $outhandle = $self->{'outhandle'};
571 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
572 return 0;
573 }
574 }
575
576 # store reference in output_info
577 my $output_info = $self->getoutputinfo();
578 return 0 if (!defined $output_info);
579 $output_info->add_info($OID, $short_doc_file, undef, undef);
580
581 return 1;
582}
583
584sub output_xml_header {
585 my $self = shift (@_);
586 my ($handle) = @_;
587
588 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
589
590 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
591 print $handle "<Archive>\n";
592}
593
594sub output_xml_footer {
595 my $self = shift (@_);
596 my ($handle) = @_;
597
598 print $handle "</Archive>\n";
599}
600
601sub output_txt_xml_header{
602 my $self = shift (@_);
603 my ($handle) = @_;
604 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
605 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
606}
607
608sub output_txt_xml_footer{
609 my $self = shift(@_);
610 my ($handle) = @_;
611 # Nothing needs to be output at present
612}
613
614sub output_mets_xml_header(){
615 my $self = shift(@_);
616 my ($handle, $OID, $doc_title) = @_;
617
618 my $version = $self->{'saveas_version'};
619
620 my $extra_attr = "";
621 if ($version eq "fedora") {
622 my $fnamespace = $ENV{'FEDORA_PID_NAMESPACE'};
623 my $oid_namespace = (defined $fnamespace) ? $fnamespace : "test";
624
625 $extra_attr = "OBJID=\"$oid_namespace:$OID\" TYPE=\"FedoraObject\" LABEL=\"$doc_title\"";
626 }
627 else {
628 # Greenstone METS profile
629 $extra_attr = "OBJID=\"$OID:2\"";
630 }
631
632
633 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
634 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
635 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
636 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
637 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
638 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
639 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
640 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
641 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
642 print $handle " $extra_attr>\n";
643
644 if ($version eq "fedora") {
645 print $handle '<mets:metsHdr RECORDSTATUS="A"/>'. "\n"; # A = active
646 }
647
648}
649
650sub output_mets_xml_footer() {
651 my $self = shift(@_);
652 my ($handle) = @_;
653 print $handle '</mets:mets>' . "\n";
654}
655
656sub output_dc_xml_header(){
657 my $self = shift(@_);
658 my ($handle, $OID) = @_;
659
660 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
661# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
662 print $handle '<dublin_core>' . "\n";
663}
664
665sub output_dc_xml_footer() {
666 my $self = shift(@_);
667 my ($handle) = @_;
668 print $handle '</dublin_core>' . "\n";
669}
6701;
Note: See TracBrowser for help on using the repository browser.