source: gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm@ 18456

Last change on this file since 18456 was 18456, checked in by davidb, 15 years ago

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# ArchivesInfPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
# Plugin which reads through an archive information database (the GDBM
# equivalent of the old archives.inf: archiveinf-doc.{ldb,bdb}, i.e. the
# file generated in the archives directory when an import is done),
# processing each file it finds.
30
31package ArchivesInfPlugin;
32
33use util;
34use PrintInfo;
35use plugin;
36use arcinfo;
37use gsprintf;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
# Inherit from PrintInfo. Set inside BEGIN so that @ISA is already in
# place while the rest of this file is being compiled.
BEGIN {
    @ArchivesInfPlugin::ISA = ('PrintInfo');
}

# This plugin declares no arguments of its own; new() still appends this
# (empty) list to the shared "ArgList".
my $arguments = [];

# Option record that new() appends to the shared "OptList".
my $options = {
    'name'     => "ArchivesInfPlugin",
    'desc'     => "{ArchivesInfPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
};
53
# Package-local shorthand for gsprintf::gsprintf: every argument is handed
# on untouched (via @_) and whatever that routine returns is returned here.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
58
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to the PrintInfo constructor
#   $hashArgOptLists - hash ref; this plugin's $arguments are appended to
#                      its "ArgList" entry and $options to its "OptList"
#                      entry
#
# Returns the object built by PrintInfo->new, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit arrow call instead of the indirect-object form
    # "new PrintInfo(...)": the indirect form is resolved by parser
    # heuristics, which is fragile in a package that defines its own new().
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
71
# Called once, at the start of processing.
#
#   $verbosity  - passed through from the processor; always stored
#   $outhandle  - stored only when defined, so a handle that is already
#                 set on the object is kept otherwise
#   $failhandle - always stored, even when undefined
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
85
# Writes the outcome of the build back to the archive information file
# that read() loaded (if any):
#   - documents whose status is "D" are removed from the archive info
#   - documents whose status is "I" or "R" are marked "B" (been indexed)
# and the archive info is then saved under the filename recorded by read().
# Does nothing when no archive info has been stored on the object.
sub deinit {
    my $self = shift (@_);

    my $arcinfo = $self->{'archive_info'};

    if (defined $arcinfo) {
        print STDERR "********* have parsed and processed an archive info file\n";

        my $arcinfo_path = $self->{'archive_info_filename'};

        # each entry is an array ref; element 1 is the document OID
        for my $entry (@{ $arcinfo->get_file_list() }) {
            my $oid    = $entry->[1];
            my $status = $arcinfo->get_status_info($oid);
            print STDERR "*** Updating $oid $status\n";

            if ($status eq "D") {
                # delete
                $arcinfo->delete_info($oid);
            }
            elsif ($status =~ m/^(I|R)$/) {
                # mark as "been indexed"
                $arcinfo->set_status_info($oid, "B");
            }
        }

        $arcinfo->save_info($arcinfo_path);
    }
}
117
# Called at the beginning of each plugin pass (import has one, building has
# many). Nothing is done here; the arguments are unpacked only to show the
# expected call signature.
sub begin {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
124
# Called at the end of each plugin pass. Nothing is done here.
sub end {
    my ($self) = shift;
}
130
# Called if we are doing incremental building.
# Stores $incremental on the object; read() consults it when deciding
# whether the per-document index status should drive processing.
sub set_incremental {
    my ($self, $incremental) = @_;

    $self->{'incremental'} = $incremental;
}
138
# Returns 1 if this class might recurse using $pluginfo. This plugin always
# does: read() passes each listed file back through plugin::read.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
145
146
# Accepts a $stats argument but records nothing in it.
sub compile_stats {
    my $self = shift;
    my ($stats) = @_;
}
151
# We don't do metadata_read: the arguments are accepted and ignored, and
# undef is always returned.
sub metadata_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # explicit undef (not a bare return): a single undef even in list context
    return undef;
}
159
# We don't do any file blocking: the arguments are accepted and ignored,
# and undef is always returned.
sub file_block_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    # explicit undef (not a bare return): a single undef even in list context
    return undef;
}
168
169
# Process an archives directory by walking its archive information database.
#
# Looks for "archiveinf-doc.ldb" (when util::is_little_endian() is true) or
# "archiveinf-doc.bdb" (otherwise) under $base_dir/$file. If it exists, the
# archive info is loaded, remembered on the object (for deinit) and every
# listed file is passed back through plugin::read.
#
#   $pluginfo, $base_dir, $block_hash, $processor, $maxdocs, $gli
#                 - handed on to plugin::read for each listed file
#   $file         - directory to look in; note that $base_dir might be ""
#                   and that $file might include directories
#   $metadata     - not carried on to the next level (an empty hash is
#                   passed instead)
#   $maxdocs      - stop once $total_count plus the files processed here
#                   reaches this number; -1 means no limit
#   $total_count  - number of files already processed by the caller
#
# Returns the number of files processed, or undef if this isn't an archives
# directory (so that someone else can try to process it).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # see if this has an archive information database within it
    # (this used to be a plain "archives.inf" file)
    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $archive_info_filename = &util::filename_cat($base_dir, $file, "archiveinf-doc$db_ext");

    # wasn't an archives directory, someone else will have to process it
    return undef unless -e $archive_info_filename;

    &gsprintf($outhandle, "ArchivesInfPlugin: {common.processing} $archive_info_filename\n") if $self->{'verbosity'} > 1;

    # read in the archive information, keeping hold of it (and of where it
    # came from) so that deinit can update and save it afterwards
    my $archive_info = arcinfo->new();
    $self->{'archive_info'} = $archive_info;
    $self->{'archive_info_filename'} = $archive_info_filename;

    $archive_info->load_info($archive_info_filename);

    my $count = 0;

    # process each file; each entry is [filename, document OID]
    foreach my $subfile (@{ $archive_info->get_file_list() }) {

        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

        my $tmp = &util::filename_cat($file, $subfile->[0]);
        next if $tmp eq $file;

        my $doc_oid = $subfile->[1];
        my $index_status = $archive_info->get_status_info($doc_oid);

        my $curr_mode = $processor->get_mode();
        my $new_mode = $curr_mode;

        # Start by assuming we want to process the file in the current mode...
        my $process_file = 1;

        # ...unless the build processor is incremental capable and
        # -incremental was specified, in which case the status decides
        if ($processor->is_incremental_capable() && $self->{'incremental'}) {
            if ($index_status eq "B") {
                # Already been indexed: nothing to do for this file
                $process_file = 0;
            }
            elsif ($index_status eq "D") {
                # Needs to be deleted from the index
                $new_mode = $curr_mode."delete";
            }
            elsif ($index_status eq "R") {
                # Needs to be reindexed
                $new_mode = $curr_mode."reindex";
            }
        }
        # ...or we're being asked to delete it (in which case skip it)
        elsif ($index_status eq "D") {
            # TODO: delete it somehow from the archives dir!!
            # => get short name, lop off filename, concat archivedir,
            #    move to recycle bin
            $process_file = 0;
        }

        next unless $process_file;

        # switch the processor into the delete/reindex mode for this one
        # file only, and put the previous mode back afterwards
        $processor->set_mode($new_mode) if ($new_mode ne $curr_mode);

        # note: metadata is not carried on to the next level
        $count += &plugin::read($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count + $count), $gli);

        $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode);
    }

    return $count;
}
263
2641;
Note: See TracBrowser for help on using the repository browser.