source: collections/trunk/nsa1/perllib/docsave.pm@ 14143

Last change on this file since 14143 was 3353, checked in by sjboddie, 22 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection
28
29# use with doc.pm rewound to version 1.25
30
31
32package docsave;
33
34use arcinfo;
35use docproc;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('docproc');
41}
42
43sub new {
44 my ($class, $collection, $archive_info, $verbosity,
45 $gzip, $groupsize, $outhandle) = @_;
46 my $self = new docproc ();
47
48
49 $groupsize=1 unless defined $groupsize;
50 $self->{'collection'} = $collection;
51 $self->{'archive_info'} = $archive_info;
52 $self->{'verbosity'} = $verbosity;
53 $self->{'gzip'} = $gzip;
54
55 $self->{'groupsize'} = $groupsize;
56 $self->{'gs_count'} = 0;
57
58 $self->{'outhandle'} = STDERR;
59 $self->{'outhandle'} = $outhandle if defined $outhandle;
60
61 # set a default for the archive directory
62 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
63
64 $self->{'sortmeta'} = undef;
65
66 if ($groupsize != 1)
67 {
68 print STDERR "Warning: -groupsize not supported in BBC collection\n";
69 print STDERR " documents are automatically grouped by date\n";
70 }
71
72 return bless $self, $class;
73}
74
75sub setarchivedir {
76 my $self = shift (@_);
77 my ($archive_dir) = @_;
78
79 $self->{'archive_dir'} = $archive_dir;
80 &util::mk_all_dir ($archive_dir);
81
82 $self->{'lower_year'} = 1900;
83
84 my @lt_array =localtime(time);
85 my $lt_year = $lt_array[5];
86 $self->{'upper_year'} = 1900 + $lt_year;
87
88 print STDERR "*" x 70, "\n";
89 print STDERR "Processing dates between ";
90 print STDERR "$self->{'lower_year'} and $self->{'upper_year'}\n";
91 print STDERR "*" x 70, "\n\n";
92
93 # set up table of file handles -- one for each year
94 $self->{'date_files'} = [];
95 my $i;
96 for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++)
97 {
98 my $FH = "OUTDATE$i";
99 my $date_file = &util::filename_cat($archive_dir, "$i.gml");
100 my $short_date_file = "$i.gml";
101
102 if (!open ($FH, ">$date_file"))
103 {
104 print STDERR "Warning: ";
105 print STDERR "docsave::new could not write to file $date_file\n";
106 $self->{'date_files'}->[$i-$self->{'lower_year'}] = undef;
107 next;
108 }
109
110 $self->{'date_files'}->[$i-$self->{'lower_year'}]
111 = { 'short_filename' => $short_date_file,
112 'long_filename' => $date_file,
113 'OUTDATE' => $FH };
114 }
115}
116
117sub process {
118 my $self = shift (@_);
119 my ($doc_obj) = @_;
120
121 my $archive_dir = $self->{'archive_dir'};
122 my $OID = $doc_obj->get_OID();
123 $OID = "NULL" unless defined $OID;
124
125 if (scalar(@{$doc_obj->get_assoc_files()})>0)
126 {
127 # get the document's directory.
128 my $doc_info = $self->{'archive_info'}->get_info($OID);
129 my $doc_dir = "";
130 if (defined $doc_info && scalar(@$doc_info) >= 1) {
131 # this OID already has an assigned directory, use the
132 # same one.
133 $doc_dir = $doc_info->[0];
134 $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
135
136 } else {
137 # have to get a new document directory
138 my $doc_dir_rest = $OID;
139 my $doc_dir_num = 0;
140 do {
141 $doc_dir .= "/" if $doc_dir_num > 0;
142 if ($doc_dir_rest =~ s/^(.{1,8})//) {
143 $doc_dir .= $1;
144 $doc_dir_num++;
145 }
146 } while ($doc_dir_rest ne "" &&
147 ((-d &util::filename_cat ($archive_dir, "$doc_dir.dir")) ||
148 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
149 $doc_dir .= ".dir";
150 }
151
152 &util::mk_all_dir ("$archive_dir/$doc_dir");
153
154 # copy all the associated files, add this information as metadata
155 # to the document
156 my @assoc_files = ();
157 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
158 if (-e $assoc_file->[0]) {
159 my $afile = &util::filename_cat($archive_dir, $doc_dir, $assoc_file->[1]);
160 &util::cp ($assoc_file->[0], $afile);
161 $doc_obj->add_metadata ($doc_obj->get_top_section(),
162 "gsdlassocfile",
163 "$assoc_file->[1]:$assoc_file->[2]");
164 } else {
165 print STDERR "docsave::process couldn't copy the associated file " .
166 "$assoc_file->[0] to $afile\n";
167 }
168 }
169 }
170
171 # save this document to appropriate DATE file
172 my $top_section = $doc_obj->get_top_section();
173 my $date_metadata = $doc_obj->get_metadata_element($top_section, "Date");
174
175 my ($year) = ($date_metadata =~ m/^(\d{4})/);
176 my $df_index = $year - $self->{'lower_year'};
177
178 if ($df_index<0)
179 {
180 print STDERR "Warning: Year $year is earlier than lower bound.";
181 }
182 elsif (!defined $self->{'date_files'}->[$df_index])
183 {
184 print STDERR "Warning: No file data present for year $year.";
185 }
186 else
187 {
188 my $df_rec = $self->{'date_files'}->[$df_index];
189 my $FH = $df_rec->{'OUTDATE'};
190
191 $doc_obj->output_section("docsave::".$FH, $top_section);
192 }
193}
194
195sub close_file_output
196{
197 my ($self) = @_;
198
199 my $i;
200 for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++)
201 {
202 my $date_rec = $self->{'date_files'}->[$i-$self->{'lower_year'}];
203 close $date_rec->{'OUTDATE'};
204
205 my $short_doc_file = $date_rec->{'short_filename'};
206 my $doc_file = $date_rec->{'long_filename'};
207
208 if (-z $doc_file)
209 {
210 &util::rm($doc_file);
211 next;
212 }
213
214 if ($self->{'gzip'})
215 {
216 my $doc_file = $self->{'gs_filename'};
217 `gzip $doc_file`;
218 $doc_file .= ".gz";
219 $short_doc_file .= ".gz";
220 if (!-e $doc_file)
221 {
222 print STDERR "error while gzipping: $doc_file doesn't exist\n";
223 return 0;
224 }
225 }
226
227 # store reference in the archive_info
228 $self->{'archive_info'}->add_info("Year_$i", $short_doc_file);
229 }
230
231 return 1;
232}
233
2341;
Note: See TracBrowser for help on using the repository browser.