########################################################################### # # docsave.pm # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # This document processor saves a document in the # archives directory of a collection package docsave; use arcinfo; use docproc; use util; sub BEGIN { @ISA = ('docproc'); } sub new { my ($class, $collection, $archive_info, $verbosity, $gzip, $groupsize, $outhandle) = @_; my $self = new docproc (); $groupsize=1 unless defined $groupsize; $self->{'collection'} = $collection; $self->{'archive_info'} = $archive_info; $self->{'verbosity'} = $verbosity; $self->{'gzip'} = $gzip; $self->{'groupsize'} = $groupsize; $self->{'gs_count'} = 0; # keep an associate array of all metavalues used by collection to # help generate the XML DTD $self->{'dtd_metadata'} = {}; $self->{'outhandle'} = STDERR; $self->{'outhandle'} = $outhandle if defined $outhandle; # set a default for the archive directory $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); $self->{'sortmeta'} = undef; return bless $self, $class; } sub setarchivedir { my $self = shift (@_); my ($archive_dir) = @_; &util::mk_all_dir ($archive_dir) unless -e $archive_dir; $self->{'archive_dir'} = $archive_dir; } sub set_sortmeta { my $self = shift (@_); my ($sortmeta) = @_; $self->{'sortmeta'} = $sortmeta; } sub process { my $self = shift (@_); my ($doc_obj) = @_; my $outhandle = $self->{'outhandle'}; if ($self->{'groupsize'} > 1) { $self->group_process ($doc_obj); } else { # groupsize is 1 (i.e. one document per GML file) so sortmeta # may be used my $OID = $doc_obj->get_OID(); $OID = "NULL" unless defined $OID; # get document's directory my $doc_dir = $self->get_doc_dir ($OID); # copy all the associated files, add this information as metadata # to the document $self->process_assoc_files ($doc_obj, $doc_dir); my $doc_file = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml"); my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml"); if (!open (OUTDOC, ">$doc_file")) { print $outhandle "docsave::process could not write to file $doc_file\n"; return; } # save this document $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section(), $self->{'collection'}, $self->{'dtd_metadata'},0); close OUTDOC; if ($self->{'gzip'}) { my $doc_file = $self->{'gs_filename'}; `gzip $doc_file`; $doc_file .= ".gz"; $short_doc_file .= ".gz"; if (!-e $doc_file) { print $outhandle "error while gzipping: $doc_file doesn't exist\n"; return 0; } } # do the sortmeta thing my ($metadata); if (defined ($self->{'sortmeta'})) { $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'}); } # store reference in the archive_info $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata); } } sub group_process { my $self = shift (@_); my ($doc_obj) = @_; my $outhandle = $self->{'outhandle'}; my $OID = $doc_obj->get_OID(); $OID = "NULL" unless defined $OID; my $groupsize = $self->{'groupsize'}; my $gs_count = $self->{'gs_count'}; my $open_new_file = (($gs_count % $groupsize)==0); # opening a new file, or document has assoicated files => directory needed if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) { # get document's directory my $doc_dir = $self->get_doc_dir ($OID); # copy all the associated files, add this information as metadata # to the document $self->process_assoc_files ($doc_obj, $doc_dir); if ($open_new_file) { # only if opening new file my $doc_file = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.gml"); my $short_doc_file = &util::filename_cat ($doc_dir, "doc.gml"); if ($gs_count>0) { return if (!$self->close_file_output()); } if (!open (OUTDOC, ">$doc_file")) { print $outhandle "docsave::group_process could not write to file $doc_file\n"; return; } $self->{'gs_filename'} = $doc_file; $self->{'gs_short_filename'} = $short_doc_file; $self->{'gs_OID'} = $OID; } } # save this document $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section(), $self->{'collection'}, $self->{'dtd_metadata'},0); $self->{'gs_count'}++; } sub get_doc_dir { my $self = shift (@_); my ($OID) = @_; my $doc_info = $self->{'archive_info'}->get_info($OID); my $doc_dir = ""; if (defined $doc_info && scalar(@$doc_info) >= 1) { # this OID already has an assigned directory, use the # same one. $doc_dir = $doc_info->[0]; $doc_dir =~ s/\/?doc\.gml(\.gz)?$//; } else { # have to get a new document directory my $doc_dir_rest = $OID; my $doc_dir_num = 0; do { $doc_dir .= "/" if $doc_dir_num > 0; if ($doc_dir_rest =~ s/^(.{1,8})//) { $doc_dir .= $1; $doc_dir_num++; } } while ($doc_dir_rest ne "" && ((-d &util::filename_cat ($self->{'archive_dir'}, "$doc_dir.dir")) || ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2))); $doc_dir .= ".dir"; } &util::mk_all_dir (&util::filename_cat ($self->{'archive_dir'}, $doc_dir)); return $doc_dir; } sub process_assoc_files { my $self = shift (@_); my ($doc_obj, $doc_dir) = @_; my $outhandle = $self->{'outhandle'}; my @assoc_files = (); foreach $assoc_file (@{$doc_obj->get_assoc_files()}) { my ($dir, $afile) = $assoc_file->[1] =~ /^(.*?)([^\/\\]+)$/; $dir = "" unless defined $dir; if (-e $assoc_file->[0]) { my $filepath = &util::filename_cat($self->{'archive_dir'}, $doc_dir, $afile); &util::hard_link ($assoc_file->[0], $filepath); $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), "gsdlassocfile", "$afile:$assoc_file->[2]:$dir"); $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(), "assocfilepath", "$doc_dir"); } elsif ($self->{'verbosity'} > 2) { print $outhandle "docsave::process couldn't copy the associated file " . "$assoc_file->[0] to $afile\n"; } } } sub close_file_output { my ($self) = @_; close OUTDOC; my $OID = $self->{'gs_OID'}; my $short_doc_file = $self->{'gs_short_filename'}; if ($self->{'gzip'}) { my $doc_file = $self->{'gs_filename'}; `gzip $doc_file`; $doc_file .= ".gz"; $short_doc_file .= ".gz"; if (!-e $doc_file) { my $outhandle = $self->{'outhandle'}; print $outhandle "error while gzipping: $doc_file doesn't exist\n"; return 0; } } # store reference in the archive_info $self->{'archive_info'}->add_info($OID, $short_doc_file); return 1; } 1;