source: trunk/gsdl/perllib/docsave.pm@ 11997

Last change on this file since 11997 was 11896, checked in by kjdon, 18 years ago

fixed line 521 - matching on a windows path gives an error like can't find unicode character property, because it has \Pr in it. so changes the match to an eq

  • Property svn:keywords set to Author Date Id Revision
File size: 19.3 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection (as xml)
28
29use strict;
30no strict 'refs';
31
32package docsave;
33
34eval {require bytes};
35
36use arcinfo;
37use docproc;
38use util;
39
40
41sub BEGIN {
42 @docsave::ISA = ('docproc');
43}
44
45
46sub calcoutputdir
47{
48 my ($service,$collectdir,$outhandle) = @_;
49
50 my $output_dir = undef;
51
52 if (($service eq "import") || ($service eq "unbuild")) {
53 $output_dir = &util::filename_cat ($collectdir, "archives");
54 }
55 elsif ($service eq "export") {
56 $output_dir = &util::filename_cat($collectdir, "export");
57 }
58 else {
59 print $outhandle "docsave::calcoutputdir did not recognise service ";
60 print $outhandle " '$service'. No output directory set.\n";
61 }
62
63 return $output_dir;
64}
65
66
67sub new {
68 my ($class, $collection, $info, $verbosity,
69 $gzip, $groupsize, $outhandle, $service, $saveas) = @_;
70 my $self = new docproc ();
71
72 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
73
74 $outhandle = 'STDERR' unless (defined $outhandle);
75 $service = "import" unless (defined $service);
76 $saveas = "GA" unless (defined $saveas);
77 $groupsize = 1 unless (defined $groupsize);
78
79 $self->{'collection'} = $collection;
80
81 $self->{'output_info'} = $info;
82 $self->{'output_dir'} = undef;
83
84 $self->{'verbosity'} = $verbosity;
85 $self->{'gzip'} = $gzip;
86 $self->{'keepimportstructure'} = 0;
87 $self->{'groupsize'} = $groupsize;
88 $self->{'gs_count'} = 0;
89
90 $self->{'outhandle'} = $outhandle;
91 $self->{'service'} = $service;
92 $self->{'saveas'} = $saveas;
93
94 $self->{'sortmeta'} = undef;
95
96 return bless $self, $class;
97}
98
99
100
101sub setoutputdir
102{
103 my $self = shift @_;
104 my ($output_dir) = @_;
105
106 &util::mk_all_dir ($output_dir) unless -e $output_dir;
107
108 $self->{'output_dir'} = $output_dir;
109}
110
111sub getoutputinfo
112{
113 my $self = shift (@_);
114
115 return $self->{'output_info'};
116}
117
118sub getoutputdir
119{
120 my $self = shift (@_);
121
122 return $self->{'output_dir'};
123}
124
125
126sub set_sortmeta {
127 my $self = shift (@_);
128 my ($sortmeta, $removeprefix, $removesuffix) = @_;
129
130 $self->{'sortmeta'} = $sortmeta;
131 if (defined ($removeprefix) && $removeprefix ) {
132 $removeprefix =~ s/^\^//; # don't need a leading ^
133 $self->{'removeprefix'} = $removeprefix;
134 }
135 if (defined ($removesuffix) && $removesuffix) {
136 $removesuffix =~ s/\$$//; # don't need a trailing $
137 $self->{'removesuffix'} = $removesuffix;
138 }
139}
140
141sub process {
142 my $self = shift (@_);
143 my ($doc_obj) = @_;
144
145 my $outhandle = $self->{'outhandle'};
146 my $service = $self->{'service'};
147
148 # Define the SaveAs Type
149 my $save_as = $self->{'saveas'};
150 my $collection = $self->{'collection'};
151
152 # set the lastmodified time before we write it out.
153 $doc_obj->set_lastmodified();
154
155 if ($self->{'groupsize'} > 1) {
156 $self->group_process ($doc_obj);
157 return;
158 }
159
160 my $OID = $doc_obj->get_OID();
161 $OID = "NULL" unless defined $OID;
162
163 my $top_section = $doc_obj->get_top_section();
164
165 # get document's directory
166 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
167
168 # groupsize is 1 (i.e. one document per XML file) so sortmeta
169 # may be used
170
171 my $output_info = $self->getoutputinfo();
172 return if (!defined $output_info);
173
174 my $output_dir = $self->getoutputdir();
175 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
176
177 # copy all the associated files, add this information as metadata
178 # to the document
179 if ($save_as eq "DSpace") {
180
181 # Genereate handle file
182 # (Note: this section of code would benefit from being restructured)
183 my $doc_handle_file = &util::filename_cat ($working_dir, "handle");
184
185 my $env_hp = $ENV{'DSPACE_HANDLE_PREFIX'};
186 my $handle_prefix = (defined $env_hp) ? $env_hp : "123456789";
187
188 if (!open(OUTDOC_HANDLE,">$doc_handle_file")){
189 print $outhandle "docsave::process could not write collection handle to file $doc_handle_file\n";
190 return;
191 }
192
193 my ($handle) = ($doc_dir =~ m/^(.*)\.dir$/);
194 print OUTDOC_HANDLE "$handle_prefix/$handle\n";
195
196 close OUTDOC_HANDLE;
197
198 # Generate contents file
199 my $doc_contents_file = &util::filename_cat ($working_dir, "contents");
200
201 if (!open(OUTDOC_CONTENTS,">$doc_contents_file")){
202 print $outhandle "docsave::process could not write collection contents to file $doc_contents_file\n";
203 return;
204 }
205 $self->process_assoc_files ($doc_obj, $doc_dir, 'docsave::OUTDOC_CONTENTS');
206
207 close OUTDOC_CONTENTS;
208
209 } else {
210 $self->process_assoc_files ($doc_obj, $doc_dir, '');
211 }
212
213 # Save the document in the requested 'save_as' format
214
215 if ($save_as eq "GA") {
216
217 my $doc_file = &util::filename_cat ($working_dir, "doc.xml");
218
219 if (!open (OUTDOC, ">$doc_file")) {
220 print $outhandle "docsave::process could not write to file $doc_file\n";
221 return;
222 }
223
224 # save this document
225 $self->output_xml_header('docsave::OUTDOC');
226 $doc_obj->output_section('docsave::OUTDOC',$top_section);
227 $self->output_xml_footer('docsave::OUTDOC');
228
229 close OUTDOC;
230 }
231 elsif ($save_as eq "METS") {
232
233 my $doc_txt_file = &util::filename_cat ($working_dir,"doctxt.xml");
234
235 if (!open(OUTDOC_TXT, ">$doc_txt_file")){
236 print $outhandle "docsave::process could not write to file $doc_txt_file\n";
237 return;
238 }
239
240 $self->output_txt_xml_header('docsave::OUTDOC_TXT');
241 $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $top_section);
242 $self->output_txt_xml_footer('docsave::OUTDOC_TXT');
243
244 close OUTDOC_TXT;
245
246 # Now save the document with metadata and text structure to docmets.xml
247
248 my $doc_mets_file = &util::filename_cat ($working_dir, "docmets.xml");
249
250 my $doc_title = $doc_obj->get_metadata_element($top_section,"dc.Title");
251 if (!defined $doc_title) {
252 $doc_title = $doc_obj->get_metadata_element($top_section,"Title");
253 }
254
255 if (!open(OUTDOC_METS,">$doc_mets_file")){
256 print $outhandle "docsave::process could not write to file $doc_mets_file\n";
257 return;
258 }
259
260 my $saveas_version = $self->{'saveas_version'};
261 $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID, $doc_title);
262 $doc_obj->output_mets_section('docsave::OUTDOC_METS',$top_section,$saveas_version,$working_dir);
263 $self->output_mets_xml_footer('docsave::OUTDOC_METS');
264
265 close OUTDOC_METS;
266 }
267 elsif ($save_as eq "DSpace") {
268
269 # Generate dublin_core.xml file
270 my $doc_dc_file = &util::filename_cat ($working_dir, "dublin_core.xml");
271
272 if (!open(OUTDOC_DC,">$doc_dc_file")){
273 print $outhandle "docsave::process could not write dublin core to file $doc_dc_file\n";
274 return;
275 }
276
277 my $saveas_version = $self->{'saveas_version'};
278
279 $self->output_dc_xml_header('docsave::OUTDOC_DC', $OID);
280 $doc_obj->output_dc_section('docsave::OUTDOC_DC',$top_section);
281 $self->output_dc_xml_footer('docsave::OUTDOC_DC');
282
283 close OUTDOC_DC;
284 } else { # save_as isn't one of the recognised types
285 print $outhandle "docsave::process unrecognised saveas type, $save_as\n";
286 return;
287 }
288
289
290 my $short_doc_file;
291
292 if ($save_as eq "GA") {
293 $short_doc_file = util::filename_cat ($doc_dir, "doc.xml");
294 } elsif ($save_as eq "METS") {
295 $short_doc_file = &util::filename_cat ($doc_dir, "docmets.xml");
296 } elsif ($save_as eq "DSpace") {
297 $short_doc_file=&util::filename_cat ($doc_dir, "dublin_core.xml");
298 } else {
299 return;
300 }
301
302 #save for later (for close_file_output())
303 $self->{'short_doc_file'} = $short_doc_file;
304
305 if ($self->{'gzip'}) {
306 my $doc_file = $self->{'gs_filename'};
307 `gzip $doc_file`;
308 $doc_file .= ".gz";
309 $short_doc_file .= ".gz";
310 if (!-e $doc_file) {
311 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
312 return 0;
313 }
314 }
315
316 # do the sortmeta thing
317 my ($metadata);
318 if (defined ($self->{'sortmeta'})) {
319 $metadata = $doc_obj->get_metadata_element($top_section,$self->{'sortmeta'});
320 }
321 if (defined ($metadata) && $metadata) {
322 # do remove prefix/suffix
323 if (defined($self->{'removeprefix'})) {
324 $metadata =~ s/^$self->{'removeprefix'}//;
325 }
326 if (defined($self->{'removesuffix'})) {
327 $metadata =~ s/$self->{'removesuffix'}$//;
328 }
329 $metadata = &sorttools::format_metadata_for_sorting($self->{'sortmeta'}, $metadata, $doc_obj);
330 }
331
332 # store reference in the output_info
333 $output_info->add_info($OID, $short_doc_file, undef, $metadata);
334}
335
336# /**
337# */
338sub group_process {
339
340 my $self = shift (@_);
341 my ($doc_obj) = @_;
342 my $outhandle = $self->{'outhandle'};
343
344 my $OID = $doc_obj->get_OID();
345 $OID = "NULL" unless defined $OID;
346
347 my $groupsize = $self->{'groupsize'};
348 my $gs_count = $self->{'gs_count'};
349 my $open_new_file = (($gs_count % $groupsize)==0);
350
351 # opening a new file, or document has assoicated files => directory needed
352 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0))
353 {
354 # The directory the archive file (doc.xml) and all associated files
355 # should end up in
356 my $doc_dir;
357 # If we've determined its time for a new file, open it now
358 if ($open_new_file || !defined($self->{'gs_doc_dir'}))
359 {
360 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
361 # only if opening new file
362 my $output_dir = $self->getoutputdir();
363
364 my $doc_file = &util::filename_cat ($output_dir, $doc_dir, "doc.xml");
365 my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
366
367 if ($gs_count>0)
368 {
369 return if (!$self->close_file_output());
370 }
371
372 if (!open (OUTDOC, ">$doc_file")) {
373 print $outhandle "docsave::group_process could not write to file $doc_file\n";
374 return;
375 }
376 $self->{'gs_filename'} = $doc_file;
377 $self->{'gs_short_filename'} = $short_doc_file;
378 $self->{'gs_OID'} = $OID;
379 $self->{'gs_doc_dir'} = $doc_dir;
380
381 $self->output_xml_header('docsave::OUTDOC');
382 }
383 # Otherwise load the same archive document directory used last time
384 else
385 {
386 $doc_dir = $self->{'gs_doc_dir'};
387 }
388 # copy all the associated files, add this information as metadata
389 # to the document
390 print STDERR "Writing associated files to $doc_dir\n";
391 $self->process_assoc_files ($doc_obj, $doc_dir);
392 }
393
394 # save this document
395 $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
396
397 $self->{'gs_count'}++;
398}
399
400sub get_doc_dir {
401 my $self = shift (@_);
402 my ($OID, $source_filename) = @_;
403
404 my $service = $self-> {'service'};
405
406 my $working_dir = $self->getoutputdir();
407 my $working_info = $self->getoutputinfo();
408 return if (!defined $working_info);
409
410 my $doc_info = $working_info->get_info($OID);
411 my $doc_dir = '';
412
413 if (defined $doc_info && scalar(@$doc_info) >= 1) {
414 # this OID already has an assigned directory, use the
415 # same one.
416 $doc_dir = $doc_info->[0];
417 $doc_dir =~ s/\/?((doc(mets)?)|(dublin_core))\.xml(\.gz)?$//;
418 } elsif ($self->{'keepimportstructure'}) {
419 $source_filename = &File::Basename::dirname($source_filename);
420 $source_filename =~ s/[\\\/]+/\//g;
421 $source_filename =~ s/\/$//;
422
423
424 #print STDERR "Source filename: $source_filename; \nImport dir:",$ENV{'GSDLIMPORTDIR'}, "\n";
425 $doc_dir = substr($source_filename, length($ENV{'GSDLIMPORTDIR'}) + 1);
426
427 }
428 if ($doc_dir eq "") {
429 # have to get a new document directory
430
431 if (($service eq "import") || ($service eq "unbuild")) {
432 my $doc_dir_rest = $OID;
433 my $doc_dir_num = 0;
434
435 do {
436 $doc_dir .= "/" if $doc_dir_num > 0;
437 if ($doc_dir_rest =~ s/^(.{1,8})//) {
438 $doc_dir .= $1;
439 $doc_dir_num++;
440 }
441 } while ($doc_dir_rest ne "" &&
442 ((-d &util::filename_cat ($working_dir, "$doc_dir.dir")) ||
443 ($working_info->size() >= 1024 && $doc_dir_num < 2)));
444 }
445 else {
446 # Export formats such as DSpace need the directory structure to
447 # be flat. This is simple to arrange (set 'doc_dir' to be the
448 # documents OID) but breaks Windows 3.1 file system compliance.
449 # Such a loss is not a big thing in this situation as such
450 # systems don't run on Windows 3.1 anyway.
451
452 $doc_dir = $OID;
453 }
454
455
456 $doc_dir .= ".dir";
457 &util::mk_all_dir (&util::filename_cat ($working_dir, $doc_dir));
458 }
459 return $doc_dir;
460}
461
462sub process_assoc_files {
463 my $self = shift (@_);
464 my ($doc_obj, $doc_dir, $handle) = @_;
465
466 my $outhandle = $self->{'outhandle'};
467 my $service = $self->{'service'};
468 my $save_as = $self->{'saveas'};
469
470 my $output_dir = $self->getoutputdir();
471 return if (!defined $output_dir);
472
473 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
474
475 my @assoc_files = ();
476 my $filename;;
477
478 my $source_filename = $doc_obj->get_source_filename();
479
480 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
481
482 if (defined $collect_dir) {
483 my $dirsep_regexp = &util::get_os_dirsep();
484
485 if ($collect_dir !~ /$dirsep_regexp$/) {
486 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
487 }
488
489 # This test is never going to fail on Windows -- is this a problem?
490 if ($source_filename !~ /^$dirsep_regexp/) {
491 $source_filename = &util::filename_cat($collect_dir, $source_filename);
492 }
493 }
494
495
496 if ($save_as eq "DSpace") {
497 my ($tail_filename) = ($source_filename =~ m/\/([^\/\\]*)$/);
498
499 print $handle "$tail_filename\n";
500
501 $filename = &util::filename_cat($working_dir, $tail_filename);
502 &util::hard_link ($source_filename, $filename);
503 }
504
505 # set the assocfile path (even if we have no assoc files - need this for lucene)
506 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
507 "assocfilepath",
508 "$doc_dir");
509 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
510 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
511 $dir = "" unless defined $dir;
512
513
514 my $real_filename = $assoc_file_rec->[0];
515 # for some reasons the image associate file has / before the full path
516 $real_filename =~ s/^\\(.*)/$1/i;
517 if (-e $real_filename) {
518
519
520 if ($save_as eq "DSpace") {
521 if ($real_filename eq $source_filename) {
522 next;
523 }
524 else {
525 my $bundle = "bundle:ORIGINAL";
526
527 if ($afile =~ m/^thumbnail\./) {
528 $bundle = "bundle:THUMBNAIL";
529 }
530
531 # Store the associated file to the "contents" file
532 print $handle "$assoc_file_rec->[1]\t$bundle\n";
533 }
534 }
535
536 $filename = &util::filename_cat($working_dir, $afile);
537
538
539 &util::hard_link ($real_filename, $filename);
540
541 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
542 "gsdlassocfile",
543 "$afile:$assoc_file_rec->[2]:$dir");
544 } elsif ($self->{'verbosity'} > 2) {
545 print $outhandle "docsave::process couldn't copy the associated file " .
546 "$real_filename to $afile\n";
547 }
548 }
549}
550
551
552sub close_file_output
553{
554 my ($self) = @_;
555 my $service =$self->{'service'};
556
557 # make sure that the handle has been opened - it won't be if we failed
558 # to import any documents...
559 if (defined(fileno(docsave::OUTDOC))) {
560 $self->output_xml_footer('docsave::OUTDOC');
561 close OUTDOC;
562 }
563
564 my $OID = $self->{'gs_OID'};
565 my $short_doc_file;
566 # can we use 'short_doc_file' for GA too?
567 if (exists($self->{'saveas'}) && $self->{'saveas'} eq "METS") {
568 $short_doc_file=$self->{'short_doc_file'};
569 } elsif ($self->{'saveas'} eq "GA") { # "GA"
570 $short_doc_file=$self->{'gs_short_filename'};
571 } else { # "DSpace"
572 }
573
574 if ($self->{'gzip'}) {
575 my $doc_file = $self->{'gs_filename'};
576 `gzip $doc_file`;
577 $doc_file .= ".gz";
578 $short_doc_file .= ".gz";
579 if (!-e $doc_file) {
580 my $outhandle = $self->{'outhandle'};
581 print $outhandle "error while gzipping: $doc_file doesn't exist\n";
582 return 0;
583 }
584 }
585
586 # store reference in output_info
587 my $output_info = $self->getoutputinfo();
588 return 0 if (!defined $output_info);
589 $output_info->add_info($OID, $short_doc_file, undef, undef);
590
591 return 1;
592}
593
594sub output_xml_header {
595 my $self = shift (@_);
596 my ($handle) = @_;
597
598 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
599
600 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
601 print $handle "<Archive>\n";
602}
603
604sub output_xml_footer {
605 my $self = shift (@_);
606 my ($handle) = @_;
607
608 print $handle "</Archive>\n";
609}
610
611sub output_txt_xml_header{
612 my $self = shift (@_);
613 my ($handle) = @_;
614 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
615 print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
616}
617
618sub output_txt_xml_footer{
619 my $self = shift(@_);
620 my ($handle) = @_;
621 # Nothing needs to be output at present
622}
623
624sub output_mets_xml_header(){
625 my $self = shift(@_);
626 my ($handle, $OID, $doc_title) = @_;
627
628 my $version = $self->{'saveas_version'};
629
630 my $extra_attr = "";
631 if ($version eq "fedora") {
632 my $fnamespace = $ENV{'FEDORA_PID_NAMESPACE'};
633 my $oid_namespace = (defined $fnamespace) ? $fnamespace : "test";
634
635 $extra_attr = "OBJID=\"$oid_namespace:$OID\" TYPE=\"FedoraObject\" LABEL=\"$doc_title\"";
636 }
637 else {
638 # Greenstone METS profile
639 $extra_attr = "OBJID=\"$OID:2\"";
640 }
641
642
643 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
644 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
645 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
646 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
647 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
648 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
649 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
650 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
651 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
652 print $handle " $extra_attr>\n";
653
654 if ($version eq "fedora") {
655 print $handle '<mets:metsHdr RECORDSTATUS="A"/>'. "\n"; # A = active
656 }
657
658}
659
660sub output_mets_xml_footer() {
661 my $self = shift(@_);
662 my ($handle) = @_;
663 print $handle '</mets:mets>' . "\n";
664}
665
666sub output_dc_xml_header(){
667 my $self = shift(@_);
668 my ($handle, $OID) = @_;
669
670 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
671# print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">'."\n";
672 print $handle '<dublin_core>' . "\n";
673}
674
675sub output_dc_xml_footer() {
676 my $self = shift(@_);
677 my ($handle) = @_;
678 print $handle '</dublin_core>' . "\n";
679}
6801;
Note: See TracBrowser for help on using the repository browser.