source: trunk/gsdl/perllib/docsave.pm@ 11089

Last change on this file since 11089 was 10538, checked in by kjdon, 19 years ago

add assocfilepath metadata always, even if no associated files. with lucene, all doc types have their doc.xml as a kind of assocfile.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use docproc;
38use util;
39
40
41sub BEGIN {
42 @docsave::ISA = ('docproc');
43}
44
45
46sub calcoutputdir
47{
48 my ($service,$collectdir,$outhandle) = @_;
49
50 my $output_dir = undef;
51
52 if (($service eq "import") || ($service eq "unbuild")) {
53 $output_dir = &util::filename_cat ($collectdir, "archives");
54 }
55 elsif ($service eq "export") {
56 $output_dir = &util::filename_cat($collectdir, "export");
57 }
58 else {
59 print $outhandle "docsave::calcoutputdir did not recognise service ";
60 print $outhandle " '$service'. No output directory set.\n";
61 }
62
63 return $output_dir;
64}
65
66
67sub new {
68 my ($class, $collection, $info, $verbosity,
69 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
70 my $self = new docproc ();
71
72 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
73
74 $outhandle = 'STDERR' unless (defined $outhandle);
75 $service = "import" unless (defined $service);
76 $saveas = "GA" unless (defined $saveas);
77 $groupsize = 1 unless (defined $groupsize);
78
79 $self->{'collection'} = $collection;
80
81 $self->{'output_info'} = $info;
82 $self->{'output_dir'} = undef;
83
84 $self->{'verbosity'} = $verbosity;
85 $self->{'gzip'} = $gzip;
86 $self->{'keepimportstructure'} = 0;
87 $self->{'groupsize'} = $groupsize;
88 $self->{'gs_count'} = 0;
89
90 $self->{'outhandle'} = $outhandle;
91 $self->{'service'} = $service;
92 $self->{'saveas'} = $saveas;
93
94 $self->{'sortmeta'} = undef;
95
96 return bless $self, $class;
97}
98
99
100
101sub setoutputdir
102{
103 my $self = shift @_;
104 my ($output_dir) = @_;
105
106 &util::mk_all_dir ($output_dir) unless -e $output_dir;
107
108 $self->{'output_dir'} = $output_dir;
109}
110
111sub getoutputinfo
112{
113 my $self = shift (@_);
114
115 return $self->{'output_info'};
116}
117
118sub getoutputdir
119{
120 my $self = shift (@_);
121
122 return $self->{'output_dir'};
123}
124
125
126sub set_sortmeta {
127 my $self = shift (@_);
128 my ($sortmeta, $removeprefix, $removesuffix) = @_;
129
130 $self->{'sortmeta'} = $sortmeta;
131 if (defined ($removeprefix) && $removeprefix ) {
132 $removeprefix =~ s/^\^//; # don't need a leading ^
133 $self->{'removeprefix'} = $removeprefix;
134 }
135 if (defined ($removesuffix) && $removesuffix) {
136 $removesuffix =~ s/\$$//; # don't need a trailing $
137 $self->{'removesuffix'} = $removesuffix;
138 }
139}
140
141sub process {
142 my $self = shift (@_);
143 my ($doc_obj) = @_;
144
145 my $outhandle = $self->{'outhandle'};
146 my $service = $self->{'service'};
147
148 # Define the SaveAs Type
149 my $save_as = $self->{'saveas'};
150 my $collection = $self->{'collection'};
151
152 # set the lastmodified time before we write it out.
153 $doc_obj->set_lastmodified();
154
155 if ($self->{'groupsize'} > 1) {
156 $self->group_process ($doc_obj);
157 return;
158 }
159
160 my $OID = $doc_obj->get_OID();
161 $OID = "NULL" unless defined $OID;
162
163 my $top_section = $doc_obj->get_top_section();
164
165 # get document's directory
166 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
167
168 # groupsize is 1 (i.e. one document per XML file) so sortmeta
169 # may be used
170
171 my $output_info = $self->getoutputinfo();
172 return if (!defined $output_info);
173
174 my $output_dir = $self->getoutputdir();
175 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
176
177 # copy all the associated files, add this information as metadata
178 # to the document
179 if ($save_as eq "DSpace") {
180
181 # Genereate handle file
182 # (Note: this section of code would benefit from being restructured)
183 my $doc_handle_file = &util::filename_cat ($working_dir, "handle");
184
185 my $env_hp = $ENV{'DSPACE_HANDLE_PREFIX'};
186 my $handle_prefix = (defined $env_hp) ? $env_hp : "123456789";
187
188 if (!open(OUTDOC_HANDLE,">$doc_handle_file")){
189 print $outhandle "docsave::process could not write collection handle to file $doc_handle_file\n";
190 return;
191 }
192
193 my ($handle) = ($doc_dir =~ m/^(.*)\.dir$/);
194 print OUTDOC_HANDLE "$handle_prefix/$handle\n";
195
196 close OUTDOC_HANDLE;
197
198 # Generate contents file
199 my $doc_contents_file = &util::filename_cat ($working_dir, "contents");
200
201 if (!open(OUTDOC_CONTENTS,">$doc_contents_file")){
202 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
203 return;
204 }
205 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_CONTENTS');
206
207 close OUTDOC_CONTENTS;
208
209 } else {
210 $self->process_assoc_files ($doc_obj, $doc_dir, '');
211 }
212
213 # Save the document in the requested 'save_as' format
214
215 if ($save_as eq "GA") {
216
217 my $doc_file = &util::filename_cat ($working_dir, "doc.xml");
218
219 if (!open (OUTDOC, ">$doc_file")) {
220 print $outhandle "docsave::process could not write to file $doc_file\n";
221 return;
222 }
223
224 # save this document
225 $self->output_xml_header('docsave::OUTDOC');
226 $doc_obj->output_section('docsave::OUTDOC',$top_section);
227 $self->output_xml_footer('docsave::OUTDOC');
228
229 close OUTDOC;
230 }
231 elsif ($save_as eq "METS") {
232
233 my $doc_txt_file = &util::filename_cat ($working_dir,"doctxt.xml");
234
235 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
236 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
237 return;
238 }
239
240 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
241 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $top_section);
242 $self->output_txt_xml_footer('docsave::OUTDOC_TXT');
243
244 close OUTDOC_TXT;
245
246 # Now save the document with metadata and text structure to docmets.xml
247
248 my $doc_mets_file = &util::filename_cat ($working_dir, "docmets.xml");
249
250 my $doc_title = $doc_obj->get_metadata_element($top_section,"dc.Title");
251 if (!defined $doc_title) {
252 $doc_title = $doc_obj->get_metadata_element($top_section,"Title");
253 }
254
255 if (!open(OUTDOC_METS,">$doc_mets_file")){
256 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
257 return;
258 }
259
260 my $saveas_version = $self->{'saveas_version'};
261 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID, $doc_title);
262 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$top_section,$saveas_version,$working_dir);
263 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
264
265 close OUTDOC_METS;
266 }
267 elsif ($save_as eq "DSpace") {
268
269 # Generate dublin_core.xml file
270 my $doc_dc_file = &util::filename_cat ($working_dir, "dublin_core.xml");
271
272 if (!open(OUTDOC_DC,">$doc_dc_file")){
273 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
274 return;
275 }
276
277 my $saveas_version = $self->{'saveas_version'};
278
279 $self->output_dc_xml_header('docsave::OUTDOC_DC', $OID);
280 $doc_obj->output_dc_section('docsave::OUTDOC_DC',$top_section);
281 $self->output_dc_xml_footer('docsave::OUTDOC_DC');
282
283 close OUTDOC_DC;
284 } else { # save_as isn't one of the recognised types
285 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
286 return;
287 }
288
289
290 my $short_doc_file;
291
292 if ($save_as eq "GA") {
293 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
294 } elsif ($save_as eq "METS") {
295 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
296 } elsif ($save_as eq "DSpace") {
297 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
298 } else {
299 return;
300 }
301
302 #save for later (for close_file_output())
303 $self->{'short_doc_file'} = $short_doc_file;
304
305 if ($self->{'gzip'}) {
306 my $doc_file = $self->{'gs_filename'};
307 `gzip $doc_file`;
308 $doc_file .= ".gz";
309 $short_doc_file .= ".gz";
310 if (!-e $doc_file) {
311 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
312 return 0;
313 }
314 }
315
316 # do the sortmeta thing
317 my ($metadata);
318 if (defined ($self->{'sortmeta'})) {
319 $metadata = $doc_obj->get_metadata_element($top_section,$self->{'sortmeta'});
320 }
321 if (defined ($metadata) && $metadata) {
322 # do remove prefix/suffix
323 if (defined($self->{'removeprefix'})) {
324 $metadata =~ s/^$self->{'removeprefix'}//;
325 }
326 if (defined($self->{'removesuffix'})) {
327 $metadata =~ s/$self->{'removesuffix'}$//;
328 }
329 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
330 }
331
332 # store reference in the output_info
333 $output_info->add_info($OID, $short_doc_file, undef, $metadata);
334}
335
336
337sub group_process {
338 my $self = shift (@_);
339 my ($doc_obj) = @_;
340
341 my $outhandle = $self->{'outhandle'};
342
343 my $OID = $doc_obj->get_OID();
344 $OID = "NULL" unless defined $OID;
345
346 my $groupsize = $self->{'groupsize'};
347 my $gs_count = $self->{'gs_count'};
348 my $open_new_file = (($gs_count % $groupsize)==0);
349
350 # opening a new file, or document has assoicated files => directory needed
351 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
352
353 # get document's directory
354 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
355
356 # copy all the associated files, add this information as metadata
357 # to the document
358 $self->process_assoc_files ($doc_obj, $doc_dir);
359
360
361 if ($open_new_file) {
362 # only if opening new file
363 my $output_dir = $self->getoutputdir();
364 my $doc_file
365 = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
366 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
367
368 if ($gs_count>0)
369 {
370 return if (!$self->close_file_output());
371 }
372
373 if (!open (OUTDOC, ">$doc_file")) {
374 print $outhandle "docsave::group_process could not write to file $doc_file\n";
375 return;
376 }
377 $self->{'gs_filename'} = $doc_file;
378 $self->{'gs_short_filename'} = $short_doc_file;
379 $self->{'gs_OID'} = $OID;
380
381 $self->output_xml_header('docsave::OUTDOC');
382 }
383 }
384
385 # save this document
386 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
387
388 $self->{'gs_count'}++;
389}
390
391sub get_doc_dir {
392 my $self = shift (@_);
393 my ($OID, $source_filename) = @_;
394
395 my $service = $self-> {'service'};
396
397 my $working_dir = $self->getoutputdir();
398 my $working_info = $self->getoutputinfo();
399 return if (!defined $working_info);
400
401 my $doc_info = $working_info->get_info($OID);
402 my $doc_dir = '';
403
404 if (defined $doc_info && scalar(@$doc_info) >= 1) {
405 # this OID already has an assigned directory, use the
406 # same one.
407 $doc_dir = $doc_info->[0];
408 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
409 } elsif ($self->{'keepimportstructure'}) {
410 $source_filename = &File::Basename::dirname($source_filename);
411 $source_filename =~ s/[\\\/]+/\//g;
412 $source_filename =~ s/\/$//;
413
414
415 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
416 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
417
418 }
419 if ($doc_dir eq "") {
420 # have to get a new document directory
421
422 if (($service eq "import") || ($service eq "unbuild")) {
423 my $doc_dir_rest = $OID;
424 my $doc_dir_num = 0;
425
426 do {
427 $doc_dir .= "/" if $doc_dir_num > 0;
428 if ($doc_dir_rest =~ s/^(.{1,8})//) {
429 $doc_dir .= $1;
430 $doc_dir_num++;
431 }
432 } while ($doc_dir_rest ne "" &&
433 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
434 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
435 }
436 else {
437 # Export formats such as DSpace need the directory structure to
438 # be flat. This is simple to arrange (set 'doc_dir' to be the
439 # documents OID) but breaks Windows 3.1 file system compliance.
440 # Such a loss is not a big thing in this situation as such
441 # systems don't run on Windows 3.1 anyway.
442
443 $doc_dir = $OID;
444 }
445
446
447 $doc_dir .= ".dir";
448 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
449 }
450 return $doc_dir;
451}
452
453sub process_assoc_files {
454 my $self = shift (@_);
455 my ($doc_obj, $doc_dir, $handle) = @_;
456
457 my $outhandle = $self->{'outhandle'};
458 my $service = $self->{'service'};
459 my $save_as = $self->{'saveas'};
460
461 my $output_dir = $self->getoutputdir();
462 return if (!defined $output_dir);
463
464 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
465
466 my @assoc_files = ();
467 my $filename;;
468
469 my $source_filename = $doc_obj->get_source_filename();
470
471 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
472
473 if (defined $collect_dir) {
474 my $dirsep_regexp = &util::get_os_dirsep();
475
476 if ($collect_dir !~ /$dirsep_regexp$/) {
477 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
478 }
479
480 # This test is never going to fail on Windows -- is this a problem?
481 if ($source_filename !~ /^$dirsep_regexp/) {
482 $source_filename = &util::filename_cat($collect_dir, $source_filename);
483 }
484 }
485
486
487 if ($save_as eq "DSpace") {
488 my ($tail_filename) = ($source_filename =~ m/\/([^\/\\]*)$/);
489
490 print $handle "$tail_filename\n";
491
492 $filename = &util::filename_cat($working_dir, $tail_filename);
493 &util::hard_link ($source_filename, $filename);
494 }
495
496 # set the assocfile path (even if we have no assoc files - need this for lucene)
497 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
498 "assocfilepath",
499 "$doc_dir");
500 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
501 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
502 $dir = "" unless defined $dir;
503
504
505 my $real_filename = $assoc_file_rec->[0];
506 # for some reasons the image associate file has / before the full path
507 $real_filename =~ s/^\\(.*)/$1/i;
508 if (-e $real_filename) {
509
510
511 if ($save_as eq "DSpace") {
512 if ($real_filename =~ m/$source_filename$/) {
513 next;
514 }
515 else {
516 my $bundle = "bundle:ORIGINAL";
517
518 if ($afile =~ m/^thumbnail\./) {
519 $bundle = "bundle:THUMBNAIL";
520 }
521
522 # Store the associated file to the "contents" file
523 print $handle "$assoc_file_rec->[1]\t$bundle\n";
524 }
525 }
526
527 $filename = &util::filename_cat($working_dir, $afile);
528
529
530 &util::hard_link ($real_filename, $filename);
531
532 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
533 "gsdlassocfile",
534 "$afile:$assoc_file_rec->[2]:$dir");
535 } elsif ($self->{'verbosity'} > 2) {
536 print $outhandle "docsave::process couldn't copy the associated file " .
537 "$real_filename to $afile\n";
538 }
539 }
540}
541
542
543sub close_file_output
544{
545 my ($self) = @_;
546 my $service =$self->{'service'};
547
548 # make sure that the handle has been opened - it won't be if we failed
549 # to import any documents...
550 if (defined(fileno(docsave::OUTDOC))) {
551 $self->output_xml_footer('docsave::OUTDOC');
552 close OUTDOC;
553 }
554
555 my $OID = $self->{'gs_OID'};
556 my $short_doc_file;
557 # can we use 'short_doc_file' for GA too?
558 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
559 $short_doc_file=$self->{'short_doc_file'};
560 } elsif ($self->{'saveas'} eq "GA") { # "GA"
561 $short_doc_file=$self->{'gs_short_filename'};
562 } else { # "DSpace"
563 }
564
565 if ($self->{'gzip'}) {
566 my $doc_file = $self->{'gs_filename'};
567 `gzip $doc_file`;
568 $doc_file .= ".gz";
569 $short_doc_file .= ".gz";
570 if (!-e $doc_file) {
571 my $outhandle = $self->{'outhandle'};
572 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
573 return 0;
574 }
575 }
576
577 # store reference in output_info
578 my $output_info = $self->getoutputinfo();
579 return 0 if (!defined $output_info);
580 $output_info->add_info($OID, $short_doc_file, undef, undef);
581
582 return 1;
583}
584
585sub output_xml_header {
586 my $self = shift (@_);
587 my ($handle) = @_;
588
589 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
590
591 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
592 print $handle "<Archive>\n";
593}
594
595sub output_xml_footer {
596 my $self = shift (@_);
597 my ($handle) = @_;
598
599 print $handle "</Archive>\n";
600}
601
602sub output_txt_xml_header{
603 my $self = shift (@_);
604 my ($handle) = @_;
605 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
606 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
607}
608
609sub output_txt_xml_footer{
610 my $self = shift(@_);
611 my ($handle) = @_;
612 # Nothing needs to be output at present
613}
614
615sub output_mets_xml_header(){
616 my $self = shift(@_);
617 my ($handle, $OID, $doc_title) = @_;
618
619 my $version = $self->{'saveas_version'};
620
621 my $extra_attr = "";
622 if ($version eq "fedora") {
623 my $fnamespace = $ENV{'FEDORA_PID_NAMESPACE'};
624 my $oid_namespace = (defined $fnamespace) ? $fnamespace : "test";
625
626 $extra_attr = "OBJID=\"$oid_namespace:$OID\" TYPE=\"FedoraObject\" LABEL=\"$doc_title\"";
627 }
628 else {
629 # Greenstone METS profile
630 $extra_attr = "OBJID=\"$OID:2\"";
631 }
632
633
634 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
635 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
636 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
637 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
638 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
639 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
640 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
641 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
642 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
643 print $handle " $extra_attr>\n";
644
645 if ($version eq "fedora") {
646 print $handle '<mets:metsHdr RECORDSTATUS="A"/>'. "\n"; # A = active
647 }
648
649}
650
651sub output_mets_xml_footer() {
652 my $self = shift(@_);
653 my ($handle) = @_;
654 print $handle '</mets:mets>' . "\n";
655}
656
657sub output_dc_xml_header(){
658 my $self = shift(@_);
659 my ($handle, $OID) = @_;
660
661 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
662# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
663 print $handle '<dublin_core>' . "\n";
664}
665
666sub output_dc_xml_footer() {
667 my $self = shift(@_);
668 my ($handle) = @_;
669 print $handle '</dublin_core>' . "\n";
670}
6711;
Note: See TracBrowser for help on using the repository browser.