root/gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm @ 18456

Revision 18456, 7.1 KB (checked in by davidb, 12 years ago)

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ArchivesInfPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Plugin which reads through an archives.inf file (or its GDBM
27# equivalent, archiveinf-doc.{ldb,bdb}), i.e. the file generated in the
28# archives directory when an import is done, processing each file it
29# finds.
30
31package ArchivesInfPlugin;
32
33use util;
34use PrintInfo;
35use plugin;
36use arcinfo;
37use gsprintf;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
# Inherit from PrintInfo.  Done in a BEGIN block so the inheritance is set up
# at compile time.
BEGIN {
    @ArchivesInfPlugin::ISA = qw(PrintInfo);
}

# This plugin contributes no arguments of its own.
my $arguments = [];

# Option record pushed onto the "OptList" by the constructor.
my $options = {
    'name'     => "ArchivesInfPlugin",
    'desc'     => "{ArchivesInfPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
};
53         
# Local shorthand for gsprintf::gsprintf: forwards every argument untouched
# and hands back whatever that routine returns.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
58
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to the PrintInfo constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by PrintInfo's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call rather than the indirect-object form
    # ("new PrintInfo (...)"): the latter is resolved by parser heuristics and
    # is ambiguous inside a package that itself defines a sub called "new".
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
71
# Called once, at the start of processing.
#
#   $verbosity  - passed through from the processor; always stored
#   $outhandle  - stored only when defined, so an existing 'outhandle'
#                 entry survives an undefined argument
#   $failhandle - always stored, even when undefined
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
85
# Bring the archive information loaded by read() up to date and write it back
# to the file it came from.  Does nothing if no archive information object
# has been stored in $self->{'archive_info'}.
#
# For every entry in the file list:
#   status "D"        -> the entry is deleted from the archive information
#   status "I" or "R" -> the entry is re-marked "B" ("been indexed")
#   anything else     -> left untouched
sub deinit {
    my ($self) = @_;

    my $arcinfo = $self->{'archive_info'};

    if (defined $arcinfo) {
        print STDERR "********* have parsed and processed an archive info file\n";

        my $arcinfo_filename = $self->{'archive_info_filename'};

        for my $entry (@{ $arcinfo->get_file_list() }) {
            # second element of each file-list entry is the document OID
            my $oid    = $entry->[1];
            my $status = $arcinfo->get_status_info($oid);
            print STDERR "*** Updating $oid $status\n";

            if ($status eq "D") {
                # delete
                $arcinfo->delete_info($oid);
            }
            elsif ($status =~ m/^(I|R)$/) {
                # mark as "been indexed"
                $arcinfo->set_status_info($oid, "B");
            }
        }

        $arcinfo->save_info($arcinfo_filename);
    }
}
117
# Called at the beginning of each plugin pass (import has one, building has
# many).  This plugin has nothing to set up, so the arguments are only
# unpacked to document the expected interface.
sub begin {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
124
# Called at the end of each plugin pass.  Nothing to tidy up here.
sub end {
    my ($self) = shift;
}
130
# Called if we are doing incremental building: remembers the flag in
# $self->{'incremental'}, where read() consults it.
sub set_incremental {
    my ($self, $incremental) = @_;

    $self->{'incremental'} = $incremental;
}
138
# Return 1 if this class might recurse using $pluginfo.  This plugin always
# answers 1: read() passes each listed file back to &plugin::read.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
145
146
# This plugin keeps no statistics of its own, so nothing is added to $stats.
sub compile_stats {
    my $self = shift;
    my ($stats) = @_;
}
151
# We don't do metadata_read: whatever the arguments, the answer is undef.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    return undef;
}
159
# We don't do any file blocking: whatever the arguments, the answer is undef.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    return undef;
}
168
169
# Process an archives directory.
#
# Looks under $base_dir/$file for the archive information database
# "archiveinf-doc.ldb" or "archiveinf-doc.bdb" (the extension is chosen by
# &util::is_little_endian(); the older plain-text "archives.inf" is no longer
# consulted).  When it exists, it is loaded and every file it lists is handed
# to &plugin::read, subject to the document's index status:
#
#   incremental build (processor is incremental capable AND
#   set_incremental() was given a true value):
#       "B"  already indexed          -> skipped
#       "D"  to be removed from index -> processed in mode "<mode>delete"
#       "R"  to be reindexed          -> processed in mode "<mode>reindex"
#       anything else                 -> processed in the current mode
#   otherwise:
#       "D"                           -> skipped
#       anything else                 -> processed in the current mode
#
# The loaded archive information object and its filename are remembered in
# $self so that deinit() can update and save them.
#
# Returns the number of files processed, or undef if this is not an archives
# directory (so that another plugin can try).  Note that $base_dir might be
# "" and that $file might include directories.
#
# $maxdocs == -1 (or undefined) means "no limit"; $total_count is the number
# of documents already processed by the caller (undefined counts as 0).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # An undefined $maxdocs would numify to 0 below, making the limit test
    # true straight away and silently skipping every document.
    $maxdocs     = -1 unless defined $maxdocs;
    $total_count = 0  unless defined $total_count;

    my $count = 0;

    # see if this has an archives information file within it
    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $doc_db = "archiveinf-doc$db_ext";
    my $archive_info_filename = &util::filename_cat($base_dir, $file, $doc_db);

    # wasn't an archives directory, someone else will have to process it
    return undef unless (-e $archive_info_filename);

    # found an archives information file
    &gsprintf($outhandle, "ArchivesInfPlugin: {common.processing} $archive_info_filename\n") if $self->{'verbosity'} > 1;

    # read in the archives information file (explicit method call instead of
    # the heuristic-dependent indirect-object form "new arcinfo ()")
    my $archive_info = arcinfo->new();
    $self->{'archive_info'} = $archive_info;
    $self->{'archive_info_filename'} = $archive_info_filename;

    $archive_info->load_info($archive_info_filename);

    my $file_list = $archive_info->get_file_list();

    # process each file
    foreach my $subfile (@$file_list) {

        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

        # each entry is [ filename, document OID ]
        my $tmp = &util::filename_cat($file, $subfile->[0]);
        next if $tmp eq $file;

        my $doc_oid = $subfile->[1];
        my $index_status = $archive_info->get_status_info($doc_oid);

        my $curr_mode = $processor->get_mode();
        my $new_mode = $curr_mode;

        # Start by assuming we want to process the file...
        my $process_file = 1;

        # ...unless the build processor is incremental capable and
        # -incremental was specified
        if ($processor->is_incremental_capable() && $self->{'incremental'}) {
            # Check to see if the file needs indexing
            if ($index_status eq "B") {
                # Don't process this file as it has already been indexed
                $process_file = 0;
            }
            elsif ($index_status eq "D") {
                # Needs to be deleted from the index.
                $new_mode = $curr_mode."delete";
                $process_file = 1;
            }
            elsif ($index_status eq "R") {
                # Needs to be reindexed.
                $new_mode = $curr_mode."reindex";
                $process_file = 1;
            }
        }
        # ... or we're being asked to delete it (in which case skip it)
        elsif ($index_status eq "D") {
            # TODO: delete it somehow from the archives dir
            # => get short name, lop off filename, concat archivedir,
            #    move to recycle bin
            $process_file = 0;
        }

        if ($process_file) {
            # note: metadata is not carried on to the next level

            # switch the processor mode only for the duration of this file
            $processor->set_mode($new_mode) if ($new_mode ne $curr_mode);

            $count += &plugin::read($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count + $count), $gli);

            $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode);
        }
    }

    return $count;
}
263
2641;
Note: See TracBrowser for help on using the browser.