source: gsdl/trunk/perllib/arcinfo.pm@ 18463

Last change on this file since 18463 was 18456, checked in by davidb, 15 years ago

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# This module stores information about the archives. At the moment
28# this information consists of the file name (relative to the
29# directory the archives information file is in), its OID and its
29# index status (see the list of status codes below).
30
31# This module assumes there is a one-to-one correspondence between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
36use constant ORDER_OID_INDEX => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX => 0;
40use constant INFO_STATUS_INDEX => 1;
41
42use strict;
43
44use dbutil;
45
46# File format read in: OID <tab> Filename <tab> Optional-Index-Status
47
48# Index status can be:
49# I = Index for the first time
50# R = Reindex
51# D = Delete
52# B = Been indexed
53
# Constructor: returns an empty archive-information object blessed
# into $class.  The object holds:
#   info         - hash of OID -> [doc_file, index_status]
#   order        - array of [OID, sortmeta] pairs, in insertion order
#   reverse_sort - flag, initially 0 (ascending sort)
sub new {
    my ($class) = @_;

    my %fields = (
        'info'         => {},
        'order'        => [],
        'reverse_sort' => 0,
    );

    return bless \%fields, $class;
}
62
# Reads a tab-separated archives information file, one record per line:
#   OID <tab> Filename <tab> Optional-Index-Status
# Every line that yields at least two fields is handed to add_info();
# shorter lines are skipped.  Does nothing when $filename is undefined
# or does not exist.  Dies if the file exists but cannot be opened.
sub _load_info_txt
{
    my $self = shift (@_);
    my ($filename) = @_;

    if (defined $filename && -e $filename) {
        # Three-argument open with a lexical handle: the filename is
        # never interpreted as an open mode, and the handle is private
        # to this call instead of being a package-global bareword.
        open (my $infile, '<', $filename) ||
            die "arcinfo::load_info couldn't read $filename\n";

        while (defined (my $line = <$infile>)) {
            $line =~ s/\cM|\cJ//g; # remove end-of-line characters
            my @fields = split ("\t", $line); # OID, filename, [status, ...]
            if (scalar(@fields) >= 2) {
                $self->add_info (@fields);
            }
        }
        close ($infile);
    }
}
85
# Loads archive information from a GDBM info database.  The records are
# fetched through dbutil::read_infodb_file_gdbm() (presumably keyed by
# OID with the raw record text as value -- confirm against dbutil).
# From each record the <doc-file> and <index-status> lines are pulled
# out and registered with add_info(); a missing line yields undef.
sub _load_info_gdbm
{
    my ($self, $filename) = @_;

    my $records = {};
    &dbutil::read_infodb_file_gdbm($filename, $records);

    for my $oid (keys %{$records}) {
        my $raw = $records->{$oid};

        # only the doc-file and index-status fields are of interest
        my ($doc_file)     = ($raw =~ /^<doc-file>(.*)$/m);
        my ($index_status) = ($raw =~ /^<index-status>(.*)$/m);

        $self->add_info ($oid, $doc_file, $index_status);
    }
}
104
# Discards whatever the object currently holds and (re)loads archive
# information from $filename.  A name ending in ".inf" is read as a
# tab-separated text file; anything else is read as a GDBM database.
# When $filename is undefined or does not exist the object is simply
# left empty.
sub load_info {
    my $self = shift (@_);
    my ($filename) = @_;

    # Reset both halves of the state together.  Clearing only 'info'
    # would leave stale [OID, sortmeta] pairs in 'order', so a reload
    # would produce duplicate entries and an inflated size().
    $self->{'info'} = {};
    $self->{'order'} = [];

    if ((defined $filename) && (-e $filename)) {
        if ($filename =~ m/\.inf$/) {
            $self->_load_info_txt($filename);
        }
        else {
            $self->_load_info_gdbm($filename);
        }
    }
}
120
# Records every key obtained from dbutil::read_infodb_keys_gdbm() for
# $filename in the 'prev_import_filelist' hash of this object (each key
# is stored with the value 1).
sub _load_filelist_gdbm
{
    my ($self, $filename) = @_;

    my $key_map = {};
    &dbutil::read_infodb_keys_gdbm($filename, $key_map);

    $self->{'prev_import_filelist'}->{$_} = 1 for keys %{$key_map};
}
134
135
# Loads the list of files seen by the previous import run.  A name
# ending in ".inf" is read with _load_info_txt(); anything else is read
# as a GDBM database whose keys are stored in 'prev_import_filelist'.
# When $filename is undefined or does not exist the list is left empty.
sub load_prev_import_filelist {
    my $self = shift (@_);
    my ($filename) = @_;

    # Historical key: still initialised so any code that reads it
    # continues to find a hash.
    $self->{'import-filelist'} = {};

    # The key _load_filelist_gdbm() actually fills.  Resetting it here
    # keeps repeated calls from accumulating entries and guarantees a
    # hash is present even when there is nothing to load.
    $self->{'prev_import_filelist'} = {};

    if ((defined $filename) && (-e $filename)) {
        if ($filename =~ m/\.inf$/) {
            # e.g. 'archives-src.inf' (which includes complete list of file
            # from last time import.pl was run)
            # NOTE(review): this path goes through add_info(), so the
            # entries land in 'info'/'order' rather than in
            # 'prev_import_filelist' -- confirm that is intended.
            $self->_load_info_txt($filename);
        }
        else {
            $self->_load_filelist_gdbm($filename);
        }
    }
}
153
# Writes the archive information to $filename as tab-separated text,
# one line per document in get_OID_list() order:
#   OID <tab> Filename <tab> Index-Status
# Any existing file is overwritten.  Dies if the file cannot be opened
# for writing or if closing it reports an error.
sub _save_info_txt {
    my $self = shift (@_);
    my ($filename) = @_;

    # Three-argument open with a lexical handle: the filename cannot
    # alter the open mode and the handle does not leak into the package.
    open (my $outfile, '>', $filename) ||
        die "arcinfo::save_info couldn't write $filename\n";

    foreach my $info (@{$self->get_OID_list()}) {
        if (defined $info) {
            print $outfile join("\t", @$info), "\n";
        }
    }

    # Buffered write errors (e.g. a full disk) only surface at close,
    # so an unchecked close could silently leave a truncated file.
    close ($outfile) ||
        die "arcinfo::save_info couldn't finish writing $filename: $!\n";
}
170
# Updates the index-status of every known document inside the GDBM info
# database $filename.  The whole database is read through dbutil, the
# <index-status> line of each matching record is rewritten, and all
# records are then written back.  Documents held by this object that
# have no record in the database only produce a warning on STDERR.
sub _save_info_gdbm {
    my ($self, $filename) = @_;

    # Not the most efficient operation, but will do for now

    # step 1: read the existing database in
    my $records = {};
    &dbutil::read_infodb_file_gdbm($filename, $records);

    # step 2: patch the index-status values
    for my $entry (@{$self->get_OID_list()}) {
        next unless defined $entry;

        my ($oid, $doc_file, $index_status) = @{$entry};

        unless (defined $records->{$oid}) {
            print STDERR "Warning: $filename does not have key $oid\n";
            next;
        }

        $records->{$oid} =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
    }

    # step 3: write everything out again
    my $out_handle = &dbutil::open_infodb_write_handle_gdbm($filename);
    for my $oid (keys %{$records}) {
        # consider making the following a method in dbutil
        # e.g. write_infodb_rawentry_gdbm($infodb_handle,$oid,$vals);

        # the stored text was never unescaped, so it needs no escaping
        print {$out_handle} "[$oid]\n";
        print {$out_handle} $records->{$oid};
        print {$out_handle} '-' x 70, "\n";
    }
    &dbutil::close_infodb_write_handle_gdbm($out_handle);
}
210
# Saves the archive information to $filename.  The storage format is
# chosen from the name: a ".inf" suffix selects the tab-separated text
# writer, anything else the GDBM writer.
sub save_info {
    my ($self, $filename) = @_;

    my $writer = ($filename =~ m/\.inf$/)
        ? '_save_info_txt'
        : '_save_info_gdbm';

    $self->$writer($filename);
}
222
# Removes the document $OID from this object: its entry in 'info' is
# deleted and the first matching [OID, sortmeta] pair is taken out of
# 'order'.  Does nothing when the OID is not known.
sub delete_info {
    my ($self, $OID) = @_;

    if (defined $self->{'info'}->{$OID}) {
        delete $self->{'info'}->{$OID};

        my $order = $self->{'order'};
        my $count = scalar (@{$order});

        for my $pos (0 .. $count - 1) {
            if ($order->[$pos]->[ORDER_OID_INDEX] eq $OID) {
                # leave the loop immediately after the splice, so the
                # shifted indices are never visited
                splice (@{$order}, $pos, 1);
                last;
            }
        }
    }
}
241
# Registers (or replaces) the document $OID.
#   $doc_file     - file name stored for the document
#   $index_status - status code; defaults to "I" (needs indexing)
#   $sortmeta     - sort key used by the list accessors; defaults to ""
# Any previous entry for the same OID is removed first, so the document
# ends up at the end of the insertion order.  Returns undef without
# changing anything when $OID is undefined.
sub add_info {
    my ($self, $OID, $doc_file, $index_status, $sortmeta) = @_;

    # only happens when no files can be processed?
    return undef if !defined($OID);

    $sortmeta     = "" if !defined $sortmeta;
    $index_status = "I" if !defined $index_status; # I = needs indexing

    $self->delete_info ($OID);

    $self->{'info'}->{$OID} = [$doc_file, $index_status];
    push (@{$self->{'order'}}, [$OID, $sortmeta]);
}
257
# Sets the index status of document $OID to $index_status.
# The record is fetched into a local variable first, so an unknown OID
# does not create a new entry in 'info'; in that case the call has no
# lasting effect.
sub set_status_info {
    my ($self, $OID, $index_status) = @_;

    my $record = $self->{'info'}->{$OID};
    $record->[INFO_STATUS_INDEX] = $index_status;
}
265
266
# Returns the index status stored for document $OID.
# Dies with "Unable to find document id ..." when the OID is unknown.
sub get_status_info {
    my ($self, $OID) = @_;

    my $record = $self->{'info'}->{$OID};

    die "Unable to find document id $OID\n"
        unless defined $record;

    return $record->[INFO_STATUS_INDEX];
}
284
# Switches the list accessors (get_OID_list, get_file_list) to
# descending sort order by raising the 'reverse_sort' flag.
sub reverse_sort
{
    my ($self) = @_;

    $self->{'reverse_sort'} = 1;
}
290
# Returns a reference to a list of the form
#   [[OID, doc_file, index_status], ...]
# ordered by each document's sortmeta string (string comparison),
# ascending by default or descending once reverse_sort() was called.
sub get_OID_list
{
    my ($self) = @_;

    # pick the comparison once, then sort the [OID, sortmeta] pairs
    my $by_sortmeta = $self->{'reverse_sort'}
        ? sub { $b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX] }
        : sub { $a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX] };

    my @sorted_pairs = sort $by_sortmeta @{$self->{'order'}};

    my @list = map {
        my $OID    = $_->[ORDER_OID_INDEX];
        my $record = $self->{'info'}->{$OID};
        [$OID, $record->[INFO_FILE_INDEX], $record->[INFO_STATUS_INDEX]];
    } @sorted_pairs;

    return \@list;
}
317
# Returns a reference to a list of the form
#   [[doc_file, OID], ...]
# ordered by each document's sortmeta string (string comparison),
# ascending by default or descending once reverse_sort() was called.
sub get_file_list {
    my ($self) = @_;

    my $descending = $self->{'reverse_sort'};

    my @sorted_pairs = sort {
        $descending
            ? $b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]
            : $a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]
    } @{$self->{'order'}};

    my @list;
    for my $pair (@sorted_pairs) {
        my $OID    = $pair->[ORDER_OID_INDEX];
        my $record = $self->{'info'}->{$OID};
        push (@list, [$record->[INFO_FILE_INDEX], $OID]);
    }

    return \@list;
}
342
343
# Returns the stored record for $OID, a reference to a list of the form
# [doc_file, index_status], or undef when the OID is unknown.  The
# reference is the object's own record, not a copy.
sub get_info {
    my ($self, $OID) = @_;

    my $record = $self->{'info'}->{$OID};

    return defined($record) ? $record : undef;
}
355
356
# Returns the number of documents registered so far, i.e. the number
# of entries in the 'order' list.
sub size {
    my ($self) = @_;

    my $order = $self->{'order'};
    return scalar (@{$order});
}
362
3631;
364
Note: See TracBrowser for help on using the repository browser.