source: main/trunk/greenstone2/perllib/arcinfo.pm@ 27982

Last change on this file since 27982 was 27697, checked in by ak19, 11 years ago

Dr Bainbridge fixed it so that the gdb files generated on Windows for diffcol match those on Linux. This actually involved changing the order in which docids appear in archiveinf-doc. This last needed the newly invented flag -sort to the ArchivesInfPlugin in combination with -sortmeta OID to import.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# This module stores information about the archives. At the moment
28# this information just consists of the file name (relative to the
29# directory the archives information file is in) and its OID.
30
31# This module assumes there is a one to one correspondance between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
36use constant ORDER_OID_INDEX => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX => 0;
40use constant INFO_STATUS_INDEX => 1;
41
42use strict;
43
44use dbutil;
45
46
47# File format read in: OID <tab> Filename <tab> Optional-Index-Status
48
49# Index status can be:
50# I = Index for the first time
51# R = Reindex
52# D = Delete
53# B = Been indexed
54
55sub new {
56 my $class = shift(@_);
57 my $infodbtype = shift(@_);
58
59 # If the infodbtype wasn't passed in, use the default from dbutil
60 if (!defined($infodbtype))
61 {
62 $infodbtype = &dbutil::get_default_infodb_type();
63 }
64
65 my $self = {'infodbtype' => $infodbtype,
66 'info'=>{},
67 'reverse-info'=>{},
68 'order'=>[],
69 'reverse_sort'=>0,
70 'sort'=>0};
71
72 return bless $self, $class;
73}
74
75sub _load_info_txt
76{
77 my $self = shift (@_);
78 my ($filename) = @_;
79
80 if (defined $filename && -e $filename) {
81 open (INFILE, $filename) ||
82 die "arcinfo::load_info couldn't read $filename\n";
83
84 my ($line, @line);
85 while (defined ($line = <INFILE>)) {
86 $line =~ s/\cM|\cJ//g; # remove end-of-line characters
87 @line = split ("\t", $line); # filename,
88 if (scalar(@line) >= 2) {
89 $self->add_info (@line);
90 }
91 }
92 close (INFILE);
93 }
94
95
96}
97
98sub _load_info_db
99{
100 my $self = shift (@_);
101 my ($filename) = @_;
102
103 my $infodb_map = {};
104
105 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
106
107 foreach my $oid ( keys %$infodb_map ) {
108 my $vals = $infodb_map->{$oid};
109 # interested in doc-file and index-status
110
111 my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
112 my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
113 my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
114 $self->add_info ($oid,$doc_file,$index_status,$sortmeta);
115 }
116}
117
118
119sub load_info {
120 my $self = shift (@_);
121 my ($filename) = @_;
122
123 $self->{'info'} = {};
124
125 if ((defined $filename) && (-e $filename)) {
126 if ($filename =~ m/\.inf$/) {
127 $self->_load_info_txt($filename);
128 }
129 else {
130 $self->_load_info_db($filename);
131 }
132 }
133}
134
135sub _load_filelist_db
136{
137 my $self = shift (@_);
138 my ($filename) = @_;
139
140 my $infodb_map = {};
141
142 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
143
144 foreach my $file ( keys %$infodb_map ) {
145 $self->{'prev_import_filelist'}->{$file} = 1;
146 }
147}
148
149
150sub load_prev_import_filelist {
151 my $self = shift (@_);
152 my ($filename) = @_;
153
154 $self->{'import-filelist'} = {};
155
156 if ((defined $filename) && (-e $filename)) {
157 if ($filename =~ m/\.inf$/) {
158 # e.g. 'archives-src.inf' (which includes complete list of file
159 # from last time import.pl was run)
160 $self->_load_info_txt($filename);
161 }
162 else {
163 $self->_load_filelist_db($filename);
164 }
165 }
166}
167
168sub load_revinfo_UNTESTED
169{
170 my $self = shift (@_);
171 my ($rev_filename) = @_;
172
173 my $rev_infodb_map = {};
174
175 &dbutil::read_infodb_file($self->{'infodbtype'}, $rev_filename, $rev_infodb_map);
176
177 foreach my $srcfile ( keys %$rev_infodb_map ) {
178 my $vals = $rev_infodb_map->{$srcfile};
179
180 foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
181 $self->add_revinfo($srcfile,$OID);
182 }
183 }
184}
185
186
187sub _save_info_txt {
188 my $self = shift (@_);
189 my ($filename) = @_;
190
191 my ($OID, $info);
192
193 open (OUTFILE, ">$filename") ||
194 die "arcinfo::save_info couldn't write $filename\n";
195
196 foreach $info (@{$self->get_OID_list()}) {
197 if (defined $info) {
198 print OUTFILE join("\t", @$info), "\n";
199 }
200 }
201 close (OUTFILE);
202}
203
204sub _save_info_db {
205 my $self = shift (@_);
206 my ($filename) = @_;
207
208 my $infodbtype = $self->{'infodbtype'};
209
210 # Not the most efficient operation, but will do for now
211
212 # read it in
213 my $infodb_map = {};
214 &dbutil::read_infodb_file($infodbtype, $filename, $infodb_map);
215
216 # change index-status values
217 foreach my $info (@{$self->get_OID_list()}) {
218 if (defined $info) {
219 my ($oid,$doc_file,$index_status) = @$info;
220 if (defined $infodb_map->{$oid}) {
221 my $vals_ref = \$infodb_map->{$oid};
222 $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
223 }
224 else {
225 print STDERR "Warning: $filename does not have key $oid\n";
226 }
227 }
228 }
229
230
231 # write out again
232 my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);
233 foreach my $oid ( keys %$infodb_map ) {
234 my $vals = $infodb_map->{$oid};
235 &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$vals);
236 }
237 &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
238
239}
240
241sub save_revinfo_db {
242 my $self = shift (@_);
243 my ($rev_filename) = @_;
244
245 # Output reverse lookup database
246
247 my $rev_infodb_map = $self->{'reverse-info'};
248 my $rev_infodb_handle
249 = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename, "append");
250
251 foreach my $key ( keys %$rev_infodb_map ) {
252 my $val_hash = $rev_infodb_map->{$key};
253 &dbutil::write_infodb_entry($self->{'infodbtype'}, $rev_infodb_handle, $key, $val_hash);
254 }
255 &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $rev_infodb_handle);
256
257}
258
259sub save_info {
260 my $self = shift (@_);
261 my ($filename) = @_;
262
263 if ($filename =~ m/(contents)|(\.inf)$/) {
264 $self->_save_info_txt($filename);
265 }
266 else {
267 $self->_save_info_db($filename);
268 }
269}
270
271sub delete_info {
272 my $self = shift (@_);
273 my ($OID) = @_;
274
275 if (defined $self->{'info'}->{$OID}) {
276 delete $self->{'info'}->{$OID};
277
278 my $i = 0;
279 while ($i < scalar (@{$self->{'order'}})) {
280 if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
281 splice (@{$self->{'order'}}, $i, 1);
282 last;
283 }
284
285 $i ++;
286 }
287 }
288}
289
290sub add_info {
291 my $self = shift (@_);
292 my ($OID, $doc_file, $index_status, $sortmeta) = @_;
293 $sortmeta = "" unless defined $sortmeta;
294 $index_status = "I" unless defined $index_status; # I = needs indexing
295
296 if (! defined($OID)) {
297 # only happens when no files can be processed?
298 return undef;
299 }
300
301 if (defined $self->{'info'}->{$OID}) {
302 # test to see if we are in a reindex situation
303
304 my $existing_status_info = $self->get_status_info($OID);
305
306 if ($existing_status_info eq "D") {
307 # yes, we're in a reindexing situation
308 $self->delete_info ($OID);
309
310
311 # force setting to "reindex"
312 $index_status = "R";
313
314 }
315 else {
316 # some other, possibly erroneous, situation has arisen
317 # where the document already seems to exist
318 print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
319 print STDERR " Deleting previous version\n";
320
321 $self->delete_info ($OID);
322 }
323 }
324
325 $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta];
326 push (@{$self->{'order'}}, [$OID, $sortmeta]); # ORDER_OID_INDEX and ORDER_SORT_INDEX
327
328
329}
330
331sub set_status_info {
332 my $self = shift (@_);
333 my ($OID, $index_status) = @_;
334
335 my $OID_info = $self->{'info'}->{$OID};
336 $OID_info->[INFO_STATUS_INDEX] = $index_status;
337}
338
339
340sub get_status_info {
341 my $self = shift (@_);
342 my ($OID) = @_;
343
344 my $index_status = undef;
345
346 my $OID_info = $self->{'info'}->{$OID};
347 if (defined $OID_info) {
348 $index_status = $OID_info->[INFO_STATUS_INDEX];
349 }
350 else {
351 die "Unable to find document id $OID\n";
352 }
353
354 return $index_status;
355
356}
357
358
359sub add_reverseinfo {
360 my $self = shift (@_);
361 my ($key, $OID) = @_;
362
363 my $existing_key = $self->{'reverse-info'}->{$key};
364 if (!defined $existing_key) {
365 $existing_key = {};
366 $self->{'reverse-info'}->{$key} = $existing_key;
367 }
368
369 my $existing_oid = $existing_key->{'oid'};
370 if (!defined $existing_oid) {
371 $existing_oid = [];
372 $existing_key->{'oid'} = $existing_oid;
373 }
374
375 push(@$existing_oid,$OID);
376}
377
378sub set_meta_file_flag {
379 my $self = shift (@_);
380 my ($key) = @_;
381
382 my $existing_key = $self->{'reverse-info'}->{$key};
383 if (!defined $existing_key) {
384 $existing_key = {};
385 $self->{'reverse-info'}->{$key} = $existing_key;
386 }
387
388 $existing_key->{'meta-file'} = ["1"];
389
390}
391sub reverse_sort
392{
393 my $self = shift(@_);
394 $self->{'reverse_sort'} = 1;
395}
396sub sort
397{
398 my $self = shift(@_);
399 $self->{'sort'} = 1;
400}
401
402
403# returns a list of the form [[OID, doc_file, index_status], ...]
404sub get_OID_list
405{
406 my $self = shift (@_);
407
408 my $order = $self->{'order'};
409
410 my @sorted_order;
411 if ($self->{'reverse_sort'}) {
412 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
413 } elsif ($self->{'sort'}) {
414 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
415 } else { # not sorting, don't bother
416 @sorted_order = @$order;
417 }
418
419 my @list = ();
420
421 foreach my $OID_order (@sorted_order) {
422 my $OID = $OID_order->[ORDER_OID_INDEX];
423 my $OID_info = $self->{'info'}->{$OID};
424
425 push (@list, [$OID, $OID_info->[INFO_FILE_INDEX],
426 $OID_info->[INFO_STATUS_INDEX]]);
427 }
428
429 return \@list;
430}
431
432# returns a list of the form [[doc_file, OID], ...]
433sub get_file_list {
434 my $self = shift (@_);
435
436 my $order = $self->{'order'};
437
438 my @sorted_order;
439 if ($self->{'reverse_sort'}) {
440 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
441 } elsif ($self->{'sort'}) {
442 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
443 } else { # not sorting, don't bother
444 @sorted_order = @$order;
445 }
446
447 my @list = ();
448
449 foreach my $OID_order (@sorted_order) {
450 my $OID = $OID_order->[ORDER_OID_INDEX];
451 my $OID_info = $self->{'info'}->{$OID};
452
453 push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
454 }
455
456 return \@list;
457}
458
459
460# returns a list of the form [doc_file,index_status,$sort_meta]
461sub get_info {
462 my $self = shift (@_);
463 my ($OID) = @_;
464
465 if (defined $self->{'info'}->{$OID}) {
466 return $self->{'info'}->{$OID};
467 }
468
469 return undef;
470}
471
472
473
474# returns the number of documents so far
475sub size {
476 my $self = shift (@_);
477 return (scalar(@{$self->{'order'}}));
478}
479
4801;
481
Note: See TracBrowser for help on using the repository browser.