source: main/trunk/greenstone2/perllib/arcinfo.pm@ 28637

Last change on this file since 28637 was 28637, checked in by kjdon, 10 years ago

added an extra field to the database: group-position. When we are processing documents into grouped doc.xml files, then this field will give the position in the doc.xml file, starting with the first document at 1. When we are reading the database to find the list of files to process for indexing, we must not process items where the group-position is > 1 - we have already seen this doc.xml file once, don't process it again.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# This module stores information about the archives. At the moment
28# this information just consists of the file name (relative to the
29# directory the archives information file is in) and its OID.
30
31# This module assumes there is a one to one correspondance between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
36use constant ORDER_OID_INDEX => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX => 0;
40use constant INFO_STATUS_INDEX => 1;
41
42use constant INFO_GROUPPOS_INDEX => 3;
43use strict;
44
45use dbutil;
46
47
48# File format read in: OID <tab> Filename <tab> Optional-Index-Status
49
50# Index status can be:
51# I = Index for the first time
52# R = Reindex
53# D = Delete
54# B = Been indexed
55
56sub new {
57 my $class = shift(@_);
58 my $infodbtype = shift(@_);
59
60 # If the infodbtype wasn't passed in, use the default from dbutil
61 if (!defined($infodbtype))
62 {
63 $infodbtype = &dbutil::get_default_infodb_type();
64 }
65
66 my $self = {'infodbtype' => $infodbtype,
67 'info'=>{},
68 'reverse-info'=>{},
69 'order'=>[],
70 'reverse_sort'=>0,
71 'sort'=>0};
72
73 return bless $self, $class;
74}
75
76sub _load_info_txt
77{
78 my $self = shift (@_);
79 my ($filename) = @_;
80
81 if (defined $filename && -e $filename) {
82 open (INFILE, $filename) ||
83 die "arcinfo::load_info couldn't read $filename\n";
84
85 my ($line, @line);
86 while (defined ($line = <INFILE>)) {
87 $line =~ s/\cM|\cJ//g; # remove end-of-line characters
88 @line = split ("\t", $line); # filename,
89 if (scalar(@line) >= 2) {
90 $self->add_info (@line);
91 }
92 }
93 close (INFILE);
94 }
95
96
97}
98
99sub _load_info_db
100{
101 my $self = shift (@_);
102 my ($filename) = @_;
103
104 my $infodb_map = {};
105
106 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
107
108 foreach my $oid ( keys %$infodb_map ) {
109 my $vals = $infodb_map->{$oid};
110 # interested in doc-file and index-status
111
112 my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
113 my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
114 my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
115 my ($group_position) = ($vals=~/^<group-position>(.*)$/m);
116 $self->add_info ($oid,$doc_file,$index_status,$sortmeta, $group_position);
117 }
118}
119
120
121sub load_info {
122 my $self = shift (@_);
123 my ($filename) = @_;
124
125 $self->{'info'} = {};
126
127 if ((defined $filename) && (-e $filename)) {
128 if ($filename =~ m/\.inf$/) {
129 $self->_load_info_txt($filename);
130 }
131 else {
132 $self->_load_info_db($filename);
133 }
134 }
135}
136
137sub _load_filelist_db
138{
139 my $self = shift (@_);
140 my ($filename) = @_;
141
142 my $infodb_map = {};
143
144 &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
145
146 foreach my $file ( keys %$infodb_map ) {
147 # turn placeholders in the file keys of archiveinf-src file back to absolute paths
148 $file = &util::placeholders_to_abspath($file);
149 $self->{'prev_import_filelist'}->{$file} = 1;
150 }
151}
152
153
154sub load_prev_import_filelist {
155 my $self = shift (@_);
156 my ($filename) = @_;
157
158 $self->{'import-filelist'} = {};
159
160 if ((defined $filename) && (-e $filename)) {
161 if ($filename =~ m/\.inf$/) {
162 # e.g. 'archives-src.inf' (which includes complete list of file
163 # from last time import.pl was run)
164 $self->_load_info_txt($filename);
165 }
166 else {
167 $self->_load_filelist_db($filename);
168 }
169 }
170}
171
172sub load_revinfo_UNTESTED
173{
174 my $self = shift (@_);
175 my ($rev_filename) = @_;
176
177 my $rev_infodb_map = {};
178
179 &dbutil::read_infodb_file($self->{'infodbtype'}, $rev_filename, $rev_infodb_map);
180
181 foreach my $srcfile ( keys %$rev_infodb_map ) {
182
183 my $vals = $rev_infodb_map->{$srcfile};
184
185 $srcfile = &util::abspath_to_placeholders($srcfile);
186
187 foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
188 $self->add_revinfo($srcfile,$OID);
189 }
190 }
191}
192
193
194sub _save_info_txt {
195 my $self = shift (@_);
196 my ($filename) = @_;
197
198 my ($OID, $info);
199
200 open (OUTFILE, ">$filename") ||
201 die "arcinfo::save_info couldn't write $filename\n";
202
203 foreach $info (@{$self->get_OID_list()}) {
204 if (defined $info) {
205 print OUTFILE join("\t", @$info), "\n";
206 }
207 }
208 close (OUTFILE);
209}
210
211sub _save_info_db {
212 my $self = shift (@_);
213 my ($filename) = @_;
214
215 my $infodbtype = $self->{'infodbtype'};
216
217 # Not the most efficient operation, but will do for now
218
219 # read it in
220 my $infodb_map = {};
221 &dbutil::read_infodb_file($infodbtype, $filename, $infodb_map);
222
223 # change index-status values
224 foreach my $info (@{$self->get_OID_list()}) {
225 if (defined $info) {
226 my ($oid,$doc_file,$index_status) = @$info;
227 if (defined $infodb_map->{$oid}) {
228 my $vals_ref = \$infodb_map->{$oid};
229 $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
230 }
231 else {
232 print STDERR "Warning: $filename does not have key $oid\n";
233 }
234 }
235 }
236
237
238 # write out again
239 my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);
240 foreach my $oid ( keys %$infodb_map ) {
241 my $vals = $infodb_map->{$oid};
242 &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$vals);
243 }
244 &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
245
246}
247
248sub save_revinfo_db {
249 my $self = shift (@_);
250 my ($rev_filename) = @_;
251
252 # Output reverse lookup database
253
254 my $rev_infodb_map = $self->{'reverse-info'};
255 my $rev_infodb_handle
256 = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename, "append");
257
258 foreach my $key ( keys %$rev_infodb_map ) {
259 my $val_hash = $rev_infodb_map->{$key};
260
261 $key = &util::abspath_to_placeholders($key);
262
263 &dbutil::write_infodb_entry($self->{'infodbtype'}, $rev_infodb_handle, $key, $val_hash);
264 }
265 &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $rev_infodb_handle);
266
267}
268
269sub save_info {
270 my $self = shift (@_);
271 my ($filename) = @_;
272 if ($filename =~ m/(contents)|(\.inf)$/) {
273 $self->_save_info_txt($filename);
274 }
275 else {
276 $self->_save_info_db($filename);
277 }
278}
279
280sub delete_info {
281 my $self = shift (@_);
282 my ($OID) = @_;
283
284 if (defined $self->{'info'}->{$OID}) {
285 delete $self->{'info'}->{$OID};
286
287 my $i = 0;
288 while ($i < scalar (@{$self->{'order'}})) {
289 if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
290 splice (@{$self->{'order'}}, $i, 1);
291 last;
292 }
293
294 $i ++;
295 }
296 }
297}
298
299sub add_info {
300 my $self = shift (@_);
301 my ($OID, $doc_file, $index_status, $sortmeta, $group_position) = @_;
302 $sortmeta = "" unless defined $sortmeta;
303 $index_status = "I" unless defined $index_status; # I = needs indexing
304 if (! defined($OID)) {
305 # only happens when no files can be processed?
306 return undef;
307 }
308
309 if (defined $self->{'info'}->{$OID}) {
310 # test to see if we are in a reindex situation
311
312 my $existing_status_info = $self->get_status_info($OID);
313
314 if ($existing_status_info eq "D") {
315 # yes, we're in a reindexing situation
316 $self->delete_info ($OID);
317
318
319 # force setting to "reindex"
320 $index_status = "R";
321
322 }
323 else {
324 # some other, possibly erroneous, situation has arisen
325 # where the document already seems to exist
326 print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
327 print STDERR " Deleting previous version\n";
328
329 $self->delete_info ($OID);
330 }
331 }
332
333 $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta, $group_position];
334 push (@{$self->{'order'}}, [$OID, $sortmeta]); # ORDER_OID_INDEX and ORDER_SORT_INDEX
335
336
337}
338
339sub set_status_info {
340 my $self = shift (@_);
341 my ($OID, $index_status) = @_;
342
343 my $OID_info = $self->{'info'}->{$OID};
344 $OID_info->[INFO_STATUS_INDEX] = $index_status;
345}
346
347
348sub get_status_info {
349 my $self = shift (@_);
350 my ($OID) = @_;
351
352 my $index_status = undef;
353
354 my $OID_info = $self->{'info'}->{$OID};
355 if (defined $OID_info) {
356 $index_status = $OID_info->[INFO_STATUS_INDEX];
357 }
358 else {
359 die "Unable to find document id $OID\n";
360 }
361
362 return $index_status;
363
364}
365
366sub get_group_position {
367 my $self = shift (@_);
368 my ($OID) = @_;
369
370 my $group_position = undef;
371 my $OID_info = $self->{'info'}->{$OID};
372 if (defined $OID_info) {
373 $group_position = $OID_info->[INFO_GROUPPOS_INDEX];
374 }
375 else {
376 die "Unable to find document id $OID\n";
377 }
378 return $group_position;
379
380}
381sub add_reverseinfo {
382 my $self = shift (@_);
383 my ($key, $OID) = @_;
384
385 my $existing_key = $self->{'reverse-info'}->{$key};
386 if (!defined $existing_key) {
387 $existing_key = {};
388 $self->{'reverse-info'}->{$key} = $existing_key;
389 }
390
391 my $existing_oid = $existing_key->{'oid'};
392 if (!defined $existing_oid) {
393 $existing_oid = [];
394 $existing_key->{'oid'} = $existing_oid;
395 }
396
397 push(@$existing_oid,$OID);
398}
399
400sub set_meta_file_flag {
401 my $self = shift (@_);
402 my ($key) = @_;
403
404 my $existing_key = $self->{'reverse-info'}->{$key};
405 if (!defined $existing_key) {
406 $existing_key = {};
407 $self->{'reverse-info'}->{$key} = $existing_key;
408 }
409
410 $existing_key->{'meta-file'} = ["1"];
411
412}
413sub reverse_sort
414{
415 my $self = shift(@_);
416 $self->{'reverse_sort'} = 1;
417}
418sub sort
419{
420 my $self = shift(@_);
421 $self->{'sort'} = 1;
422}
423
424
425# returns a list of the form [[OID, doc_file, index_status], ...]
426sub get_OID_list
427{
428 my $self = shift (@_);
429
430 my $order = $self->{'order'};
431
432 my @sorted_order;
433 if ($self->{'reverse_sort'}) {
434 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
435 } elsif ($self->{'sort'}) {
436 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
437 } else { # not sorting, don't bother
438 @sorted_order = @$order;
439 }
440
441 my @list = ();
442
443 foreach my $OID_order (@sorted_order) {
444 my $OID = $OID_order->[ORDER_OID_INDEX];
445 my $OID_info = $self->{'info'}->{$OID};
446
447 push (@list, [$OID, $OID_info->[INFO_FILE_INDEX],
448 $OID_info->[INFO_STATUS_INDEX]]);
449 }
450
451 return \@list;
452}
453
454# returns a list of the form [[doc_file, OID], ...]
455sub get_file_list {
456 my $self = shift (@_);
457
458 my $order = $self->{'order'};
459
460 my @sorted_order;
461 if ($self->{'reverse_sort'}) {
462 @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
463 } elsif ($self->{'sort'}) {
464 @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
465 } else { # not sorting, don't bother
466 @sorted_order = @$order;
467 }
468
469 my @list = ();
470
471 foreach my $OID_order (@sorted_order) {
472 my $OID = $OID_order->[ORDER_OID_INDEX];
473 my $OID_info = $self->{'info'}->{$OID};
474
475 push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
476 }
477
478 return \@list;
479}
480
481
482# returns a list of the form [doc_file,index_status,$sort_meta, $group_position]
483sub get_info {
484 my $self = shift (@_);
485 my ($OID) = @_;
486
487 if (defined $self->{'info'}->{$OID}) {
488 return $self->{'info'}->{$OID};
489 }
490
491 return undef;
492}
493
494
495
496# returns the number of documents so far
497sub size {
498 my $self = shift (@_);
499 return (scalar(@{$self->{'order'}}));
500}
501
5021;
503
Note: See TracBrowser for help on using the repository browser.