root/main/trunk/greenstone2/perllib/arcinfo.pm @ 28637

Revision 28637, 11.7 KB (checked in by kjdon, 6 years ago)

added an extra field to the database: group-position. When we are processing documents into grouped doc.xml files, then this field will give the position in the doc.xml file, starting with the first document at 1. When we are reading the database to find the list of files to process for indexing, we must not process items where the group-position is > 1 - we have already seen this doc.xml file once, don't process it again.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# arcinfo.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# This module stores information about the archives. At the moment
28# this information just consists of the file name (relative to the
29# directory the archives information file is in) and its OID.
30
31# This module assumes there is a one to one correspondance between
32# a file in the archives directory and an OID.
33
34package arcinfo;
35
36use constant ORDER_OID_INDEX  => 0;
37use constant ORDER_SORT_INDEX => 1;
38
39use constant INFO_FILE_INDEX    => 0;
40use constant INFO_STATUS_INDEX  => 1;
41
42use constant INFO_GROUPPOS_INDEX  => 3;
43use strict;
44
45use dbutil;
46
47
48# File format read in: OID <tab> Filename <tab> Optional-Index-Status
49
50# Index status can be:
51#  I = Index for the first time
52#  R = Reindex
53#  D = Delete
54#  B = Been indexed
55
56sub new {
57    my $class = shift(@_);
58    my $infodbtype = shift(@_);
59
60    # If the infodbtype wasn't passed in, use the default from dbutil
61    if (!defined($infodbtype))
62    {
63      $infodbtype = &dbutil::get_default_infodb_type();
64    }
65
66    my $self = {'infodbtype' => $infodbtype,
67        'info'=>{},
68        'reverse-info'=>{},
69        'order'=>[],
70        'reverse_sort'=>0,
71        'sort'=>0};
72
73    return bless $self, $class;
74}
75
76sub _load_info_txt
77{
78    my $self = shift (@_);
79    my ($filename) = @_;
80
81    if (defined $filename && -e $filename) {
82    open (INFILE, $filename) ||
83        die "arcinfo::load_info couldn't read $filename\n";
84
85    my ($line, @line);
86    while (defined ($line = <INFILE>)) {
87        $line =~ s/\cM|\cJ//g; # remove end-of-line characters
88        @line = split ("\t", $line); # filename,
89        if (scalar(@line) >= 2) {
90        $self->add_info (@line);
91        }
92    }
93    close (INFILE);
94    }
95
96
97}
98
99sub _load_info_db
100{
101    my $self = shift (@_);
102    my ($filename) = @_;
103
104    my $infodb_map = {};
105
106    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
107
108    foreach my $oid ( keys %$infodb_map ) {
109    my $vals = $infodb_map->{$oid};
110    # interested in doc-file and index-status
111
112    my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
113    my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
114    my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
115    my ($group_position) = ($vals=~/^<group-position>(.*)$/m);
116    $self->add_info ($oid,$doc_file,$index_status,$sortmeta, $group_position);
117    }
118}
119
120
121sub load_info {
122    my $self = shift (@_);
123    my ($filename) = @_;
124
125    $self->{'info'} = {};
126
127    if ((defined $filename) && (-e $filename)) {
128    if ($filename =~ m/\.inf$/) {
129        $self->_load_info_txt($filename);
130    }
131    else {
132        $self->_load_info_db($filename);
133    }
134    }
135}
136
137sub _load_filelist_db
138{
139    my $self = shift (@_);
140    my ($filename) = @_;
141
142    my $infodb_map = {};
143
144    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);
145
146    foreach my $file ( keys %$infodb_map ) {
147    # turn placeholders in the file keys of archiveinf-src file back to absolute paths
148    $file = &util::placeholders_to_abspath($file);
149    $self->{'prev_import_filelist'}->{$file} = 1;
150    }
151}
152
153
154sub load_prev_import_filelist {
155    my $self = shift (@_);
156    my ($filename) = @_;
157
158    $self->{'import-filelist'} = {};
159
160    if ((defined $filename) && (-e $filename)) {
161    if ($filename =~ m/\.inf$/) {
162        # e.g. 'archives-src.inf' (which includes complete list of file
163        # from last time import.pl was run)
164        $self->_load_info_txt($filename);
165    }
166    else {
167        $self->_load_filelist_db($filename);
168    }
169    }
170}
171
172sub load_revinfo_UNTESTED
173{
174    my $self = shift (@_);
175    my ($rev_filename) = @_;
176
177    my $rev_infodb_map = {};
178
179    &dbutil::read_infodb_file($self->{'infodbtype'}, $rev_filename, $rev_infodb_map);
180
181    foreach my $srcfile ( keys %$rev_infodb_map ) {
182
183    my $vals = $rev_infodb_map->{$srcfile};
184
185    $srcfile = &util::abspath_to_placeholders($srcfile);
186
187    foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
188        $self->add_revinfo($srcfile,$OID);
189    }
190    }
191}
192
193
194sub _save_info_txt {
195    my $self = shift (@_);
196    my ($filename) = @_;
197
198    my ($OID, $info);
199
200    open (OUTFILE, ">$filename") ||
201    die "arcinfo::save_info couldn't write $filename\n";
202 
203    foreach $info (@{$self->get_OID_list()}) {
204    if (defined $info) {
205        print OUTFILE join("\t", @$info), "\n";
206    }
207    }
208    close (OUTFILE);
209}
210
211sub _save_info_db {
212    my $self = shift (@_);
213    my ($filename) = @_;
214
215    my $infodbtype = $self->{'infodbtype'};
216
217    # Not the most efficient operation, but will do for now
218
219    # read it in
220    my $infodb_map = {};
221    &dbutil::read_infodb_file($infodbtype, $filename, $infodb_map);
222
223    # change index-status values
224    foreach my $info (@{$self->get_OID_list()}) {
225    if (defined $info) {
226        my ($oid,$doc_file,$index_status) = @$info;
227        if (defined $infodb_map->{$oid}) {
228        my $vals_ref = \$infodb_map->{$oid};
229        $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
230        }
231        else {
232        print STDERR "Warning: $filename does not have key $oid\n";
233        }
234    }
235    }
236
237
238    # write out again
239    my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);
240    foreach my $oid ( keys %$infodb_map ) {
241    my $vals = $infodb_map->{$oid};
242    &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$vals);
243    }
244    &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
245
246}
247
248sub save_revinfo_db {
249    my $self = shift (@_);
250    my ($rev_filename) = @_;
251
252    # Output reverse lookup database
253
254    my $rev_infodb_map = $self->{'reverse-info'};
255    my $rev_infodb_handle
256    = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename, "append");
257
258    foreach my $key ( keys %$rev_infodb_map ) {
259    my $val_hash = $rev_infodb_map->{$key};
260
261    $key = &util::abspath_to_placeholders($key);   
262
263    &dbutil::write_infodb_entry($self->{'infodbtype'}, $rev_infodb_handle, $key, $val_hash);
264    }
265    &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $rev_infodb_handle);
266
267}
268
269sub save_info {
270    my $self = shift (@_);
271    my ($filename) = @_;
272    if ($filename =~ m/(contents)|(\.inf)$/) {
273    $self->_save_info_txt($filename);
274    }
275    else {
276    $self->_save_info_db($filename);
277    }
278}
279
280sub delete_info {
281    my $self = shift (@_);
282    my ($OID) = @_;
283
284    if (defined $self->{'info'}->{$OID}) {
285    delete $self->{'info'}->{$OID};
286   
287    my $i = 0;
288    while ($i < scalar (@{$self->{'order'}})) {
289        if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
290        splice (@{$self->{'order'}}, $i, 1);
291        last;
292        }
293       
294        $i ++;
295    }
296    }
297}
298
299sub add_info {
300    my $self = shift (@_);
301    my ($OID, $doc_file, $index_status, $sortmeta, $group_position) = @_;
302    $sortmeta = "" unless defined $sortmeta;
303    $index_status = "I" unless defined $index_status; # I = needs indexing
304    if (! defined($OID)) {
305    # only happens when no files can be processed?
306    return undef;
307    }
308
309    if (defined $self->{'info'}->{$OID}) {
310    # test to see if we are in a reindex situation
311
312    my $existing_status_info = $self->get_status_info($OID);
313
314    if ($existing_status_info eq "D") {
315        # yes, we're in a reindexing situation
316        $self->delete_info ($OID);
317
318
319        # force setting to "reindex"
320        $index_status = "R";
321
322    }
323    else {
324        # some other, possibly erroneous, situation has arisen
325        # where the document already seems to exist
326        print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
327        print STDERR "         Deleting previous version\n";
328
329        $self->delete_info ($OID);
330    }
331    }
332
333    $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta, $group_position];
334    push (@{$self->{'order'}}, [$OID, $sortmeta]); # ORDER_OID_INDEX and ORDER_SORT_INDEX
335
336
337}
338
339sub set_status_info {
340    my $self = shift (@_);
341    my ($OID, $index_status) = @_;
342
343    my $OID_info = $self->{'info'}->{$OID};
344    $OID_info->[INFO_STATUS_INDEX] = $index_status;
345}
346
347
348sub get_status_info {
349    my $self = shift (@_);
350    my ($OID) = @_;
351
352    my $index_status = undef;
353
354    my $OID_info = $self->{'info'}->{$OID};
355    if (defined $OID_info) {
356    $index_status = $OID_info->[INFO_STATUS_INDEX];
357    }
358    else {
359    die "Unable to find document id $OID\n";
360    }
361
362    return $index_status;
363
364}
365
366sub get_group_position {
367    my $self = shift (@_);
368    my ($OID) = @_;
369
370    my $group_position = undef;
371    my $OID_info = $self->{'info'}->{$OID};
372    if (defined $OID_info) {
373    $group_position = $OID_info->[INFO_GROUPPOS_INDEX];
374    }
375    else {
376    die "Unable to find document id $OID\n";
377    }
378    return $group_position;
379   
380}
381sub add_reverseinfo {
382    my $self = shift (@_);
383    my ($key, $OID) = @_;
384
385    my $existing_key = $self->{'reverse-info'}->{$key};
386    if (!defined $existing_key) {
387    $existing_key = {};
388    $self->{'reverse-info'}->{$key} = $existing_key;
389    }
390
391    my $existing_oid = $existing_key->{'oid'}; 
392    if (!defined $existing_oid) {
393    $existing_oid = [];
394    $existing_key->{'oid'} = $existing_oid;
395    }
396
397    push(@$existing_oid,$OID);
398}
399
400sub set_meta_file_flag {
401    my $self = shift (@_);
402    my ($key) = @_;
403
404    my $existing_key = $self->{'reverse-info'}->{$key};
405    if (!defined $existing_key) {
406    $existing_key = {};
407    $self->{'reverse-info'}->{$key} = $existing_key;
408    }
409
410    $existing_key->{'meta-file'} = ["1"];
411
412}
413sub reverse_sort
414{
415    my $self = shift(@_);
416    $self->{'reverse_sort'} = 1;
417}
418sub sort
419{
420    my $self = shift(@_);
421    $self->{'sort'} = 1;
422}
423
424
425# returns a list of the form [[OID, doc_file, index_status], ...]
426sub get_OID_list
427{
428    my $self = shift (@_);
429
430    my $order = $self->{'order'};
431
432    my @sorted_order;
433    if ($self->{'reverse_sort'}) {
434    @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
435    } elsif ($self->{'sort'}) {
436    @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
437    } else { # not sorting, don't bother
438    @sorted_order = @$order;
439    }
440
441    my @list = ();
442
443    foreach my $OID_order (@sorted_order) {
444    my $OID = $OID_order->[ORDER_OID_INDEX];
445    my $OID_info = $self->{'info'}->{$OID};
446
447    push (@list, [$OID, $OID_info->[INFO_FILE_INDEX],
448              $OID_info->[INFO_STATUS_INDEX]]);
449    }
450
451    return \@list;
452}
453
454# returns a list of the form [[doc_file, OID], ...]
455sub get_file_list {
456    my $self = shift (@_);
457
458    my $order = $self->{'order'};
459
460    my @sorted_order;
461    if ($self->{'reverse_sort'}) {
462    @sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
463    } elsif ($self->{'sort'}) {
464    @sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
465    } else { # not sorting, don't bother
466    @sorted_order = @$order;
467    }
468
469    my @list = ();
470
471    foreach my $OID_order (@sorted_order) {
472    my $OID = $OID_order->[ORDER_OID_INDEX];
473    my $OID_info = $self->{'info'}->{$OID};
474
475    push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
476    }
477
478    return \@list;
479}
480
481
482# returns a list of the form [doc_file,index_status,$sort_meta, $group_position]
483sub get_info {
484    my $self = shift (@_);
485    my ($OID) = @_;
486
487    if (defined $self->{'info'}->{$OID}) {
488    return $self->{'info'}->{$OID};
489    }
490
491    return undef;
492}
493
494
495
496# returns the number of documents so far
497sub size {
498    my $self = shift (@_);
499    return (scalar(@{$self->{'order'}}));
500}
501
5021;
503
Note: See TracBrowser for help on using the browser.