########################################################################### # # docsave.pm # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # This document processor saves a document in the # archives directory of a collection # use with doc.pm rewound to version 1.25 package docsave; use arcinfo; use docproc; use util; sub BEGIN { @ISA = ('docproc'); } sub new { my ($class, $collection, $archive_info, $verbosity, $gzip, $groupsize, $outhandle) = @_; my $self = new docproc (); $groupsize=1 unless defined $groupsize; $self->{'collection'} = $collection; $self->{'archive_info'} = $archive_info; $self->{'verbosity'} = $verbosity; $self->{'gzip'} = $gzip; $self->{'groupsize'} = $groupsize; $self->{'gs_count'} = 0; $self->{'outhandle'} = STDERR; $self->{'outhandle'} = $outhandle if defined $outhandle; # set a default for the archive directory $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); $self->{'sortmeta'} = undef; if ($groupsize != 1) { print STDERR "Warning: -groupsize not supported in BBC collection\n"; print STDERR " documents are automatically grouped by date\n"; } return bless $self, $class; } sub setarchivedir { my $self = shift (@_); my ($archive_dir) = @_; $self->{'archive_dir'} = $archive_dir; &util::mk_all_dir ($archive_dir); $self->{'lower_year'} = 1900; my @lt_array =localtime(time); my $lt_year = $lt_array[5]; $self->{'upper_year'} = 1900 + $lt_year; print STDERR "*" x 70, "\n"; print STDERR "Processing dates between "; print STDERR "$self->{'lower_year'} and $self->{'upper_year'}\n"; print STDERR "*" x 70, "\n\n"; # set up table of file handles -- one for each year $self->{'date_files'} = []; my $i; for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++) { my $FH = "OUTDATE$i"; my $date_file = &util::filename_cat($archive_dir, "$i.gml"); my $short_date_file = "$i.gml"; if (!open ($FH, ">$date_file")) { print STDERR "Warning: "; print STDERR "docsave::new could not write to file $date_file\n"; $self->{'date_files'}->[$i-$self->{'lower_year'}] = undef; next; } $self->{'date_files'}->[$i-$self->{'lower_year'}] = { 'short_filename' => $short_date_file, 'long_filename' => $date_file, 'OUTDATE' => $FH }; } } sub process { my $self = shift (@_); my ($doc_obj) = @_; my $archive_dir = $self->{'archive_dir'}; my $OID = $doc_obj->get_OID(); $OID = "NULL" unless defined $OID; if (scalar(@{$doc_obj->get_assoc_files()})>0) { # get the document's directory. my $doc_info = $self->{'archive_info'}->get_info($OID); my $doc_dir = ""; if (defined $doc_info && scalar(@$doc_info) >= 1) { # this OID already has an assigned directory, use the # same one. $doc_dir = $doc_info->[0]; $doc_dir =~ s/\/?doc\.gml(\.gz)?$//; } else { # have to get a new document directory my $doc_dir_rest = $OID; my $doc_dir_num = 0; do { $doc_dir .= "/" if $doc_dir_num > 0; if ($doc_dir_rest =~ s/^(.{1,8})//) { $doc_dir .= $1; $doc_dir_num++; } } while ($doc_dir_rest ne "" && ((-d &util::filename_cat ($archive_dir, "$doc_dir.dir")) || ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2))); $doc_dir .= ".dir"; } &util::mk_all_dir ("$archive_dir/$doc_dir"); # copy all the associated files, add this information as metadata # to the document my @assoc_files = (); foreach $assoc_file (@{$doc_obj->get_assoc_files()}) { if (-e $assoc_file->[0]) { my $afile = &util::filename_cat($archive_dir, $doc_dir, $assoc_file->[1]); &util::cp ($assoc_file->[0], $afile); $doc_obj->add_metadata ($doc_obj->get_top_section(), "gsdlassocfile", "$assoc_file->[1]:$assoc_file->[2]"); } else { print STDERR "docsave::process couldn't copy the associated file " . "$assoc_file->[0] to $afile\n"; } } } # save this document to appropriate DATE file my $top_section = $doc_obj->get_top_section(); my $date_metadata = $doc_obj->get_metadata_element($top_section, "Date"); my ($year) = ($date_metadata =~ m/^(\d{4})/); my $df_index = $year - $self->{'lower_year'}; if ($df_index<0) { print STDERR "Warning: Year $year is earlier than lower bound."; } elsif (!defined $self->{'date_files'}->[$df_index]) { print STDERR "Warning: No file data present for year $year."; } else { my $df_rec = $self->{'date_files'}->[$df_index]; my $FH = $df_rec->{'OUTDATE'}; $doc_obj->output_section("docsave::".$FH, $top_section); } } sub close_file_output { my ($self) = @_; my $i; for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++) { my $date_rec = $self->{'date_files'}->[$i-$self->{'lower_year'}]; close $date_rec->{'OUTDATE'}; my $short_doc_file = $date_rec->{'short_filename'}; my $doc_file = $date_rec->{'long_filename'}; if (-z $doc_file) { &util::rm($doc_file); next; } if ($self->{'gzip'}) { my $doc_file = $self->{'gs_filename'}; `gzip $doc_file`; $doc_file .= ".gz"; $short_doc_file .= ".gz"; if (!-e $doc_file) { print STDERR "error while gzipping: $doc_file doesn't exist\n"; return 0; } } # store reference in the archive_info $self->{'archive_info'}->add_info("Year_$i", $short_doc_file); } return 1; } 1;