source: main/trunk/greenstone2/perllib/arcinfo.pm@ 27697

Last change on this file since 27697 was 27697, checked in by ak19, 11 years ago

Dr Bainbridge fixed it so that the gdb files generated on Windows for diffcol match those on Linux. This actually involved changing the order in which docids appear in archiveinf-doc. This last needed the newly invented flag -sort to the ArchivesInfPlugin in combination with -sortmeta OID to import.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
RevLine 
[537]1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
[4]27# This module stores information about the archives. At the moment
28# this information just consists of the file name (relative to the
29# directory the archives information file is in) and its OID.
30
31# This module assumes there is a one to one correspondance between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
[10157]36use constant ORDER_OID_INDEX => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX => 0;
40use constant INFO_STATUS_INDEX => 1;
41
[15889]42use strict;
43
[21543]44use dbutil;
[15889]45
[21543]46
[10157]47# File format read in: OID <tab> Filename <tab> Optional-Index-Status
48
49# Index status can be:
50# I = Index for the first time
51# R = Reindex
52# D = Delete
53# B = Been indexed
54
[4]55sub new {
[21579]56 my $class = shift(@_);
57 my $infodbtype = shift(@_);
58
59 # If the infodbtype wasn't passed in, use the default from dbutil
60 if (!defined($infodbtype))
61 {
62 $infodbtype = &dbutil::get_default_infodb_type();
63 }
64
65 my $self = {'infodbtype' => $infodbtype,
66 'info'=>{},
[19774]67 'reverse-info'=>{},
[15073]68 'order'=>[],
[27697]69 'reverse_sort'=>0,
70 'sort'=>0};
[4]71
72 return bless $self, $class;
73}
74
[18441]75sub _load_info_txt
76{
[4]77 my $self = shift (@_);
78 my ($filename) = @_;
79
[12328]80 if (defined $filename && -e $filename) {
[4]81 open (INFILE, $filename) ||
82 die "arcinfo::load_info couldn't read $filename\n";
83
84 my ($line, @line);
85 while (defined ($line = <INFILE>)) {
86 $line =~ s/\cM|\cJ//g; # remove end-of-line characters
87 @line = split ("\t", $line); # filename,
88 if (scalar(@line) >= 2) {
[14]89 $self->add_info (@line);
[4]90 }
91 }
92 close (INFILE);
93 }
[18441]94
95
[4]96}
97
[21564]98sub _load_info_db
[18441]99{
[4]100 my $self = shift (@_);
101 my ($filename) = @_;
102
[18441]103 my $infodb_map = {};
104
[21579]105 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
[18441]106
107 foreach my $oid ( keys %$infodb_map ) {
108 my $vals = $infodb_map->{$oid};
109 # interested in doc-file and index-status
110
111 my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
112 my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
[20747]113 my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
114 $self->add_info ($oid,$doc_file,$index_status,$sortmeta);
[18441]115 }
116}
117
[19774]118
[18441]119sub load_info {
120 my $self = shift (@_);
121 my ($filename) = @_;
122
123 $self->{'info'} = {};
124
125 if ((defined $filename) && (-e $filename)) {
126 if ($filename =~ m/\.inf$/) {
127 $self->_load_info_txt($filename);
128 }
129 else {
[21564]130 $self->_load_info_db($filename);
[18441]131 }
132 }
133}
134
[21564]135sub _load_filelist_db
[18441]136{
137 my $self = shift (@_);
138 my ($filename) = @_;
139
140 my $infodb_map = {};
141
[21585]142 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
[18441]143
144 foreach my $file ( keys %$infodb_map ) {
[18456]145 $self->{'prev_import_filelist'}->{$file} = 1;
[18441]146 }
147}
148
149
[18456]150sub load_prev_import_filelist {
[18441]151 my $self = shift (@_);
152 my ($filename) = @_;
153
154 $self->{'import-filelist'} = {};
155
156 if ((defined $filename) && (-e $filename)) {
157 if ($filename =~ m/\.inf$/) {
158 # e.g. 'archives-src.inf' (which includes complete list of file
159 # from last time import.pl was run)
160 $self->_load_info_txt($filename);
161 }
162 else {
[21564]163 $self->_load_filelist_db($filename);
[18441]164 }
165 }
166}
167
[19774]168sub load_revinfo_UNTESTED
169{
170 my $self = shift (@_);
171 my ($rev_filename) = @_;
172
173 my $rev_infodb_map = {};
174
[21579]175 &dbutil::read_infodb_file($self->{'infodbtype'}, $rev_filename, $rev_infodb_map);
[19774]176
177 foreach my $srcfile ( keys %$rev_infodb_map ) {
178 my $vals = $rev_infodb_map->{$srcfile};
179
180 foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
181 $self->add_revinfo($srcfile,$OID);
182 }
183 }
184}
185
186
[18441]187sub _save_info_txt {
188 my $self = shift (@_);
189 my ($filename) = @_;
190
[14]191 my ($OID, $info);
[4]192
193 open (OUTFILE, ">$filename") ||
194 die "arcinfo::save_info couldn't write $filename\n";
[7904]195
[14]196 foreach $info (@{$self->get_OID_list()}) {
[4]197 if (defined $info) {
[14]198 print OUTFILE join("\t", @$info), "\n";
[4]199 }
200 }
201 close (OUTFILE);
202}
203
[21564]204sub _save_info_db {
[18441]205 my $self = shift (@_);
206 my ($filename) = @_;
207
[21857]208 my $infodbtype = $self->{'infodbtype'};
209
[18441]210 # Not the most efficient operation, but will do for now
211
212 # read it in
213 my $infodb_map = {};
[21857]214 &dbutil::read_infodb_file($infodbtype, $filename, $infodb_map);
[18441]215
216 # change index-status values
217 foreach my $info (@{$self->get_OID_list()}) {
218 if (defined $info) {
219 my ($oid,$doc_file,$index_status) = @$info;
220 if (defined $infodb_map->{$oid}) {
221 my $vals_ref = \$infodb_map->{$oid};
222 $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
223 }
224 else {
225 print STDERR "Warning: $filename does not have key $oid\n";
226 }
227 }
228 }
229
230
231 # write out again
[21857]232 my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);
[18441]233 foreach my $oid ( keys %$infodb_map ) {
[21857]234 my $vals = $infodb_map->{$oid};
235 &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$vals);
[18441]236 }
[21857]237 &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
[18441]238
239}
240
[21564]241sub save_revinfo_db {
[19774]242 my $self = shift (@_);
243 my ($rev_filename) = @_;
244
245 # Output reverse lookup database
246
247 my $rev_infodb_map = $self->{'reverse-info'};
248 my $rev_infodb_handle
[21579]249 = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename, "append");
[19774]250
251 foreach my $key ( keys %$rev_infodb_map ) {
252 my $val_hash = $rev_infodb_map->{$key};
[21579]253 &dbutil::write_infodb_entry($self->{'infodbtype'}, $rev_infodb_handle, $key, $val_hash);
[19774]254 }
[21579]255 &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $rev_infodb_handle);
[19774]256
257}
258
[18441]259sub save_info {
260 my $self = shift (@_);
261 my ($filename) = @_;
262
[20537]263 if ($filename =~ m/(contents)|(\.inf)$/) {
[18441]264 $self->_save_info_txt($filename);
265 }
266 else {
[21564]267 $self->_save_info_db($filename);
[18441]268 }
269}
270
[14]271sub delete_info {
272 my $self = shift (@_);
273 my ($OID) = @_;
274
275 if (defined $self->{'info'}->{$OID}) {
276 delete $self->{'info'}->{$OID};
277
278 my $i = 0;
279 while ($i < scalar (@{$self->{'order'}})) {
[10157]280 if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
[14]281 splice (@{$self->{'order'}}, $i, 1);
282 last;
283 }
284
285 $i ++;
286 }
287 }
288}
289
[4]290sub add_info {
291 my $self = shift (@_);
[10157]292 my ($OID, $doc_file, $index_status, $sortmeta) = @_;
[1287]293 $sortmeta = "" unless defined $sortmeta;
[10157]294 $index_status = "I" unless defined $index_status; # I = needs indexing
[4]295
[3416]296 if (! defined($OID)) {
297 # only happens when no files can be processed?
298 return undef;
299 }
[10157]300
[18469]301 if (defined $self->{'info'}->{$OID}) {
302 # test to see if we are in a reindex situation
303
304 my $existing_status_info = $self->get_status_info($OID);
305
306 if ($existing_status_info eq "D") {
307 # yes, we're in a reindexing situation
308 $self->delete_info ($OID);
309
310
311 # force setting to "reindex"
312 $index_status = "R";
313
314 }
315 else {
316 # some other, possibly erroneous, situation has arisen
317 # where the document already seems to exist
318 print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
319 print STDERR " Deleting previous version\n";
320
321 $self->delete_info ($OID);
322 }
323 }
324
[20747]325 $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta];
[27697]326 push (@{$self->{'order'}}, [$OID, $sortmeta]); # ORDER_OID_INDEX and ORDER_SORT_INDEX
[18469]327
328
[4]329}
330
[10157]331sub set_status_info {
[4]332 my $self = shift (@_);
[10157]333 my ($OID, $index_status) = @_;
[4]334
[10157]335 my $OID_info = $self->{'info'}->{$OID};
336 $OID_info->[INFO_STATUS_INDEX] = $index_status;
337}
338
339
340sub get_status_info {
341 my $self = shift (@_);
342 my ($OID) = @_;
343
344 my $index_status = undef;
345
346 my $OID_info = $self->{'info'}->{$OID};
347 if (defined $OID_info) {
348 $index_status = $OID_info->[INFO_STATUS_INDEX];
349 }
350 else {
351 die "Unable to find document id $OID\n";
352 }
353
354 return $index_status;
355
356}
357
[19774]358
359sub add_reverseinfo {
360 my $self = shift (@_);
361 my ($key, $OID) = @_;
362
363 my $existing_key = $self->{'reverse-info'}->{$key};
364 if (!defined $existing_key) {
365 $existing_key = {};
366 $self->{'reverse-info'}->{$key} = $existing_key;
367 }
368
369 my $existing_oid = $existing_key->{'oid'};
370 if (!defined $existing_oid) {
371 $existing_oid = [];
372 $existing_key->{'oid'} = $existing_oid;
373 }
374
375 push(@$existing_oid,$OID);
376}
377
[20802]378sub set_meta_file_flag {
379 my $self = shift (@_);
380 my ($key) = @_;
[19774]381
[20802]382 my $existing_key = $self->{'reverse-info'}->{$key};
383 if (!defined $existing_key) {
384 $existing_key = {};
385 $self->{'reverse-info'}->{$key} = $existing_key;
386 }
387
388 $existing_key->{'meta-file'} = ["1"];
389
390}
[15073]391sub reverse_sort
392{
393 my $self = shift(@_);
394 $self->{'reverse_sort'} = 1;
395}
[27697]396sub sort
397{
398 my $self = shift(@_);
399 $self->{'sort'} = 1;
400}
[10157]401
[27697]402
[10157]403# returns a list of the form [[OID, doc_file, index_status], ...]
404sub get_OID_list
405{
406 my $self = shift (@_);
407
408 my $order = $self->{'order'};
409
[15073]410 my @sorted_order;
411 if ($self->{'reverse_sort'}) {
412 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
[27697]413 } elsif ($self->{'sort'}) {
[15073]414 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
[27697]415 } else { # not sorting, don't bother
416 @sorted_order = @$order;
[15073]417 }
[10157]418
[4]419 my @list = ();
420
[10157]421 foreach my $OID_order (@sorted_order) {
422 my $OID = $OID_order->[ORDER_OID_INDEX];
423 my $OID_info = $self->{'info'}->{$OID};
424
425 push (@list, [$OID, $OID_info->[INFO_FILE_INDEX],
426 $OID_info->[INFO_STATUS_INDEX]]);
[4]427 }
428
429 return \@list;
430}
431
432# returns a list of the form [[doc_file, OID], ...]
433sub get_file_list {
434 my $self = shift (@_);
435
[10157]436 my $order = $self->{'order'};
437
[15073]438 my @sorted_order;
439 if ($self->{'reverse_sort'}) {
440 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
[27697]441 } elsif ($self->{'sort'}) {
[15073]442 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
[27697]443 } else { # not sorting, don't bother
444 @sorted_order = @$order;
[15073]445 }
[10157]446
[4]447 my @list = ();
448
[15889]449 foreach my $OID_order (@sorted_order) {
[10157]450 my $OID = $OID_order->[ORDER_OID_INDEX];
451 my $OID_info = $self->{'info'}->{$OID};
452
453 push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
[4]454 }
455
456 return \@list;
457}
458
459
[20747]460# returns a list of the form [doc_file,index_status,$sort_meta]
[4]461sub get_info {
462 my $self = shift (@_);
463 my ($OID) = @_;
464
465 if (defined $self->{'info'}->{$OID}) {
466 return $self->{'info'}->{$OID};
467 }
468
469 return undef;
470}
471
472
[20747]473
[98]474# returns the number of documents so far
475sub size {
476 my $self = shift (@_);
477 return (scalar(@{$self->{'order'}}));
478}
479
[4]4801;
481
Note: See TracBrowser for help on using the repository browser.