source: main/trunk/greenstone2/perllib/arcinfo.pm@ 21564

Last change on this file since 21564 was 21564, checked in by mdewsnip, 14 years ago

Changed lots of occurrences of "GDBM" in comments, variable names and function names, where the code isn't GDBM-specific. Part of making the code less GDBM-specific.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.5 KB
Line 
1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# This module stores information about the archives. At the moment
28# this information just consists of the file name (relative to the
29# directory the archives information file is in) and its OID.
30
31# This module assumes there is a one to one correspondance between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
36use constant ORDER_OID_INDEX => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX => 0;
40use constant INFO_STATUS_INDEX => 1;
41
42use strict;
43
44use dbutil;
45
46
47# File format read in: OID <tab> Filename <tab> Optional-Index-Status
48
49# Index status can be:
50# I = Index for the first time
51# R = Reindex
52# D = Delete
53# B = Been indexed
54
55sub new {
56 my ($class) = @_;
57 my $self = {'info'=>{},
58 'reverse-info'=>{},
59 'order'=>[],
60 'reverse_sort'=>0};
61
62 return bless $self, $class;
63}
64
65sub _load_info_txt
66{
67 my $self = shift (@_);
68 my ($filename) = @_;
69
70 if (defined $filename && -e $filename) {
71 open (INFILE, $filename) ||
72 die "arcinfo::load_info couldn't read $filename\n";
73
74 my ($line, @line);
75 while (defined ($line = <INFILE>)) {
76 $line =~ s/\cM|\cJ//g; # remove end-of-line characters
77 @line = split ("\t", $line); # filename,
78 if (scalar(@line) >= 2) {
79 $self->add_info (@line);
80 }
81 }
82 close (INFILE);
83 }
84
85
86}
87
88sub _load_info_db
89{
90 my $self = shift (@_);
91 my ($filename) = @_;
92
93 my $infodb_map = {};
94
95 &dbutil::read_infodb_file("gdbm", $filename, $infodb_map);
96
97 foreach my $oid ( keys %$infodb_map ) {
98 my $vals = $infodb_map->{$oid};
99 # interested in doc-file and index-status
100
101 my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
102 my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
103 my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
104 $self->add_info ($oid,$doc_file,$index_status,$sortmeta);
105 }
106}
107
108
109sub load_info {
110 my $self = shift (@_);
111 my ($filename) = @_;
112
113 $self->{'info'} = {};
114
115 if ((defined $filename) && (-e $filename)) {
116 if ($filename =~ m/\.inf$/) {
117 $self->_load_info_txt($filename);
118 }
119 else {
120 $self->_load_info_db($filename);
121 }
122 }
123}
124
125sub _load_filelist_db
126{
127 my $self = shift (@_);
128 my ($filename) = @_;
129
130 my $infodb_map = {};
131
132 &dbutil::read_infodb_keys("gdbm", $filename, $infodb_map);
133
134 foreach my $file ( keys %$infodb_map ) {
135 $self->{'prev_import_filelist'}->{$file} = 1;
136 }
137}
138
139
140sub load_prev_import_filelist {
141 my $self = shift (@_);
142 my ($filename) = @_;
143
144 $self->{'import-filelist'} = {};
145
146 if ((defined $filename) && (-e $filename)) {
147 if ($filename =~ m/\.inf$/) {
148 # e.g. 'archives-src.inf' (which includes complete list of file
149 # from last time import.pl was run)
150 $self->_load_info_txt($filename);
151 }
152 else {
153 $self->_load_filelist_db($filename);
154 }
155 }
156}
157
158sub load_revinfo_UNTESTED
159{
160 my $self = shift (@_);
161 my ($rev_filename) = @_;
162
163 my $rev_infodb_map = {};
164
165 &dbutil::read_infodb_file("gdbm", $rev_filename, $rev_infodb_map);
166
167 foreach my $srcfile ( keys %$rev_infodb_map ) {
168 my $vals = $rev_infodb_map->{$srcfile};
169
170 foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
171 $self->add_revinfo($srcfile,$OID);
172 }
173 }
174}
175
176
177sub _save_info_txt {
178 my $self = shift (@_);
179 my ($filename) = @_;
180
181 my ($OID, $info);
182
183 open (OUTFILE, ">$filename") ||
184 die "arcinfo::save_info couldn't write $filename\n";
185
186 foreach $info (@{$self->get_OID_list()}) {
187 if (defined $info) {
188 print OUTFILE join("\t", @$info), "\n";
189 }
190 }
191 close (OUTFILE);
192}
193
194sub _save_info_db {
195 my $self = shift (@_);
196 my ($filename) = @_;
197
198 # Not the most efficient operation, but will do for now
199
200 # read it in
201 my $infodb_map = {};
202 &dbutil::read_infodb_file("gdbm", $filename, $infodb_map);
203
204 # change index-status values
205 foreach my $info (@{$self->get_OID_list()}) {
206 if (defined $info) {
207 my ($oid,$doc_file,$index_status) = @$info;
208 if (defined $infodb_map->{$oid}) {
209 my $vals_ref = \$infodb_map->{$oid};
210 $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
211 }
212 else {
213 print STDERR "Warning: $filename does not have key $oid\n";
214 }
215 }
216 }
217
218
219 # write out again
220 my $infodb_handle = &dbutil::open_infodb_write_handle("gdbm", $filename);
221 foreach my $oid ( keys %$infodb_map ) {
222 # consider making the following a method in dbutil
223 # e.g. write_infodb_rawentry_gdbm($infodb_handle,$oid,$vals);
224
225 # no need to escape, as $infodb_map->{$oid} hasn't been unescaped
226 print $infodb_handle "[$oid]\n";
227 print $infodb_handle $infodb_map->{$oid};
228 print $infodb_handle '-' x 70, "\n";
229 }
230 &dbutil::close_infodb_write_handle("gdbm", $infodb_handle);
231
232}
233
234sub save_revinfo_db {
235 my $self = shift (@_);
236 my ($rev_filename) = @_;
237
238 # Output reverse lookup database
239
240 my $rev_infodb_map = $self->{'reverse-info'};
241 my $rev_infodb_handle
242 = &dbutil::open_infodb_write_handle("gdbm", $rev_filename, "append");
243
244 foreach my $key ( keys %$rev_infodb_map ) {
245 my $val_hash = $rev_infodb_map->{$key};
246 &dbutil::write_infodb_entry("gdbm", $rev_infodb_handle, $key, $val_hash);
247 }
248 &dbutil::close_infodb_write_handle("gdbm", $rev_infodb_handle);
249
250}
251
252sub save_info {
253 my $self = shift (@_);
254 my ($filename) = @_;
255
256 if ($filename =~ m/(contents)|(\.inf)$/) {
257 $self->_save_info_txt($filename);
258 }
259 else {
260 $self->_save_info_db($filename);
261 }
262}
263
264sub delete_info {
265 my $self = shift (@_);
266 my ($OID) = @_;
267
268 if (defined $self->{'info'}->{$OID}) {
269 delete $self->{'info'}->{$OID};
270
271 my $i = 0;
272 while ($i < scalar (@{$self->{'order'}})) {
273 if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
274 splice (@{$self->{'order'}}, $i, 1);
275 last;
276 }
277
278 $i ++;
279 }
280 }
281}
282
283sub add_info {
284 my $self = shift (@_);
285 my ($OID, $doc_file, $index_status, $sortmeta) = @_;
286 $sortmeta = "" unless defined $sortmeta;
287 $index_status = "I" unless defined $index_status; # I = needs indexing
288
289 if (! defined($OID)) {
290 # only happens when no files can be processed?
291 return undef;
292 }
293
294 if (defined $self->{'info'}->{$OID}) {
295 # test to see if we are in a reindex situation
296
297 my $existing_status_info = $self->get_status_info($OID);
298
299 if ($existing_status_info eq "D") {
300 # yes, we're in a reindexing situation
301 $self->delete_info ($OID);
302
303
304 # force setting to "reindex"
305 $index_status = "R";
306
307 }
308 else {
309 # some other, possibly erroneous, situation has arisen
310 # where the document already seems to exist
311 print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
312 print STDERR " Deleting previous version\n";
313
314 $self->delete_info ($OID);
315 }
316 }
317
318 $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta];
319 push (@{$self->{'order'}}, [$OID, $sortmeta]);
320
321
322}
323
324sub set_status_info {
325 my $self = shift (@_);
326 my ($OID, $index_status) = @_;
327
328 my $OID_info = $self->{'info'}->{$OID};
329 $OID_info->[INFO_STATUS_INDEX] = $index_status;
330}
331
332
333sub get_status_info {
334 my $self = shift (@_);
335 my ($OID) = @_;
336
337 my $index_status = undef;
338
339 my $OID_info = $self->{'info'}->{$OID};
340 if (defined $OID_info) {
341 $index_status = $OID_info->[INFO_STATUS_INDEX];
342 }
343 else {
344 die "Unable to find document id $OID\n";
345 }
346
347 return $index_status;
348
349}
350
351
352sub add_reverseinfo {
353 my $self = shift (@_);
354 my ($key, $OID) = @_;
355
356 my $existing_key = $self->{'reverse-info'}->{$key};
357 if (!defined $existing_key) {
358 $existing_key = {};
359 $self->{'reverse-info'}->{$key} = $existing_key;
360 }
361
362 my $existing_oid = $existing_key->{'oid'};
363 if (!defined $existing_oid) {
364 $existing_oid = [];
365 $existing_key->{'oid'} = $existing_oid;
366 }
367
368 push(@$existing_oid,$OID);
369}
370
371sub set_meta_file_flag {
372 my $self = shift (@_);
373 my ($key) = @_;
374
375 my $existing_key = $self->{'reverse-info'}->{$key};
376 if (!defined $existing_key) {
377 $existing_key = {};
378 $self->{'reverse-info'}->{$key} = $existing_key;
379 }
380
381 $existing_key->{'meta-file'} = ["1"];
382
383}
384sub reverse_sort
385{
386 my $self = shift(@_);
387 $self->{'reverse_sort'} = 1;
388}
389
390# returns a list of the form [[OID, doc_file, index_status], ...]
391sub get_OID_list
392{
393 my $self = shift (@_);
394
395 my $order = $self->{'order'};
396
397 my @sorted_order;
398 if ($self->{'reverse_sort'}) {
399 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
400 } else {
401 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
402 }
403
404 my @list = ();
405
406 foreach my $OID_order (@sorted_order) {
407 my $OID = $OID_order->[ORDER_OID_INDEX];
408 my $OID_info = $self->{'info'}->{$OID};
409
410 push (@list, [$OID, $OID_info->[INFO_FILE_INDEX],
411 $OID_info->[INFO_STATUS_INDEX]]);
412 }
413
414 return \@list;
415}
416
417# returns a list of the form [[doc_file, OID], ...]
418sub get_file_list {
419 my $self = shift (@_);
420
421 my $order = $self->{'order'};
422
423 my @sorted_order;
424 if ($self->{'reverse_sort'}) {
425 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
426 } else {
427 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
428 }
429
430 my @list = ();
431
432 foreach my $OID_order (@sorted_order) {
433 my $OID = $OID_order->[ORDER_OID_INDEX];
434 my $OID_info = $self->{'info'}->{$OID};
435
436 push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
437 }
438
439 return \@list;
440}
441
442
443# returns a list of the form [doc_file,index_status,$sort_meta]
444sub get_info {
445 my $self = shift (@_);
446 my ($OID) = @_;
447
448 if (defined $self->{'info'}->{$OID}) {
449 return $self->{'info'}->{$OID};
450 }
451
452 return undef;
453}
454
455
456
457# returns the number of documents so far
458sub size {
459 my $self = shift (@_);
460 return (scalar(@{$self->{'order'}}));
461}
462
4631;
464
Note: See TracBrowser for help on using the repository browser.