source: trunk/BBC_2005/collect/bbc/perllib/docsave.pm@ 10021

Last change on this file since 10021 was 10021, checked in by chi, 19 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# docsave.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor saves a document in the
27# archives directory of a collection
28
29# use with doc.pm rewound to version 1.25
30
31
32package docsave;
33
34use arcinfo;
35use docproc;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('docproc');
41}
42
43sub new {
44 my ($class, $collection, $archive_info, $verbosity,
45 $gzip, $groupsize, $outhandle) = @_;
46 my $self = new docproc ();
47
48
49 $groupsize=1 unless defined $groupsize;
50 $self->{'collection'} = $collection;
51 $self->{'archive_info'} = $archive_info;
52 $self->{'verbosity'} = $verbosity;
53 $self->{'gzip'} = $gzip;
54
55 $self->{'groupsize'} = $groupsize;
56 $self->{'gs_count'} = 0;
57
58 $self->{'outhandle'} = STDERR;
59 $self->{'outhandle'} = $outhandle if defined $outhandle;
60
61 # set a default for the archive directory
62 $self->{'archive_dir'} = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
63
64 $self->{'sortmeta'} = undef;
65
66 if ($groupsize != 1)
67 {
68 print STDERR "Warning: -groupsize not supported in BBC collection\n";
69 print STDERR " documents are automatically grouped by date\n";
70 }
71
72 return bless $self, $class;
73}
74
75sub setarchivedir {
76 my $self = shift (@_);
77 my ($archive_dir) = @_;
78
79 $self->{'archive_dir'} = $archive_dir;
80 &util::mk_all_dir ($archive_dir);
81
82 $self->{'lower_year'} = 1800;
83
84 my @lt_array =localtime(time);
85 my $lt_year = $lt_array[5];
86 $self->{'upper_year'} = 1900 + $lt_year;
87
88 print STDERR "*" x 70, "\n";
89 print STDERR "Processing dates between ";
90 print STDERR "$self->{'lower_year'} and $self->{'upper_year'}\n";
91 print STDERR "*" x 70, "\n\n";
92
93 # set up table of file handles -- one for each year
94 $self->{'date_files'} = [];
95 my $i;
96 for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++)
97 {
98 my $FH = "OUTDATE$i";
99 #print STDERR "****FH = $FH\n";
100 my $date_file = &util::filename_cat($archive_dir, "$i.gml");
101 my $short_date_file = "$i.gml";
102
103 if (!open ($FH, ">$date_file"))
104 {
105 print STDERR "Warning: ";
106 print STDERR "docsave::new could not write to file $date_file\n";
107 $self->{'date_files'}->[$i-$self->{'lower_year'}] = undef;
108 next;
109 }
110
111 $self->{'date_files'}->[$i-$self->{'lower_year'}]
112 = { 'short_filename' => $short_date_file,
113 'long_filename' => $date_file,
114 'OUTDATE' => $FH };
115 }
116}
117
118sub process {
119 my $self = shift (@_);
120 my ($doc_obj) = @_;
121
122 my $archive_dir = $self->{'archive_dir'};
123 my $OID = $doc_obj->get_OID();
124 $OID = "NULL" unless defined $OID;
125
126 if (scalar(@{$doc_obj->get_assoc_files()})>0)
127 {
128 # get the document's directory.
129 my $doc_info = $self->{'archive_info'}->get_info($OID);
130 my $doc_dir = "";
131 if (defined $doc_info && scalar(@$doc_info) >= 1) {
132 # this OID already has an assigned directory, use the
133 # same one.
134 $doc_dir = $doc_info->[0];
135 $doc_dir =~ s/\/?doc\.gml(\.gz)?$//;
136
137 } else {
138 # have to get a new document directory
139 my $doc_dir_rest = $OID;
140 my $doc_dir_num = 0;
141 do {
142 $doc_dir .= "/" if $doc_dir_num > 0;
143 if ($doc_dir_rest =~ s/^(.{1,8})//) {
144 $doc_dir .= $1;
145 $doc_dir_num++;
146 }
147 } while ($doc_dir_rest ne "" &&
148 ((-d &util::filename_cat ($archive_dir, "$doc_dir.dir")) ||
149 ($self->{'archive_info'}->size() >= 1024 && $doc_dir_num < 2)));
150 $doc_dir .= ".dir";
151 }
152
153 &util::mk_all_dir ("$archive_dir/$doc_dir");
154
155 # copy all the associated files, add this information as metadata
156 # to the document
157 my @assoc_files = ();
158 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
159 if (-e $assoc_file->[0]) {
160 my $afile = &util::filename_cat($archive_dir, $doc_dir, $assoc_file->[1]);
161 &util::cp ($assoc_file->[0], $afile);
162 $doc_obj->add_metadata ($doc_obj->get_top_section(),
163 "gsdlassocfile",
164 "$assoc_file->[1]:$assoc_file->[2]");
165 } else {
166 print STDERR "docsave::process couldn't copy the associated file " .
167 "$assoc_file->[0] to $afile\n";
168 }
169 }
170 }
171
172 # save this document to appropriate DATE file
173 my $top_section = $doc_obj->get_top_section();
174 my $date_metadata = $doc_obj->get_metadata_element($top_section, "Date");
175
176 my ($year) = ($date_metadata =~ m/^(\d{4})/);
177 my $df_index = $year - $self->{'lower_year'};
178
179 #print STDERR "**** File Year =$df_index\n";
180 if ($df_index<0)
181 {
182 print STDERR "Warning: Year $year is earlier than lower bound.";
183 }
184 elsif (!defined $self->{'date_files'}->[$df_index])
185 {
186 print STDERR "Warning: No file data present for year $year.";
187 }
188 else
189 {
190 my $df_rec = $self->{'date_files'}->[$df_index];
191 my $FH = $df_rec->{'OUTDATE'};
192
193 $doc_obj->output_section("docsave::".$FH, $top_section);
194 }
195}
196
197sub close_file_output
198{
199 my ($self) = @_;
200
201 my $i;
202 for ($i=$self->{'lower_year'}; $i<=$self->{'upper_year'}; $i++)
203 {
204 my $date_rec = $self->{'date_files'}->[$i-$self->{'lower_year'}];
205 close $date_rec->{'OUTDATE'};
206
207 my $short_doc_file = $date_rec->{'short_filename'};
208 my $doc_file = $date_rec->{'long_filename'};
209
210 if (-z $doc_file)
211 {
212 &util::rm($doc_file);
213 next;
214 }
215
216 if ($self->{'gzip'})
217 {
218 my $doc_file = $self->{'gs_filename'};
219 `gzip $doc_file`;
220 $doc_file .= ".gz";
221 $short_doc_file .= ".gz";
222 if (!-e $doc_file)
223 {
224 print STDERR "error while gzipping: $doc_file doesn't exist\n";
225 return 0;
226 }
227 }
228
229 # store reference in the archive_info
230 $self->{'archive_info'}->add_info("Year_$i", $short_doc_file);
231 }
232
233 return 1;
234}
235
2361;
Note: See TracBrowser for help on using the repository browser.