source: gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm@ 18528

Last change on this file since 18528 was 18528, checked in by davidb, 15 years ago

OIDmetadata wasn't supported in collect.cfg, but OIDtype was. Now rectified. Also introduced OIDcount as a file saved in the archives folder to help doc.pm use the correct value when working incrementally

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1###########################################################################
2#
3# ArchivesInfPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
# Plugin which reads through an archives.inf file (or its GDBM equivalent,
# an archiveinf-doc.{ldb,bdb} file), i.e. the file generated in the
# archives directory when an import is done, and processes each file it
# finds listed there.
30
31package ArchivesInfPlugin;
32
33use util;
34use doc;
35use PrintInfo;
36use plugin;
37use arcinfo;
38use gsprintf;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
# This plugin inherits from PrintInfo.
BEGIN {
    @ArchivesInfPlugin::ISA = ('PrintInfo');
}

# This plugin defines no arguments of its own.
my $arguments = [];

# Option description that new() appends to the "OptList" it is handed.
my $options = {
    'name'     => "ArchivesInfPlugin",
    'desc'     => "{ArchivesInfPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
};
54
# Package-local shorthand for gsprintf::gsprintf: every argument is handed
# on unchanged and whatever gsprintf::gsprintf returns is returned.
sub gsprintf {
    my $formatter = \&gsprintf::gsprintf;
    return $formatter->(@_);
}
59
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the PrintInfo constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to its "OptList" entry
# Returns the object built by PrintInfo, blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @$arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
72
# Called once, at the start of processing.
#   $verbosity  - verbosity level passed through from the processor
#   $outhandle  - output handle; only stored when it is defined
#   $failhandle - failure handle; always stored, even when undefined
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    # keep any previously stored outhandle if none was supplied
    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
86
# Called once, at the end of processing.
#
# Does nothing unless read() stored an archive info object on $self.
# Otherwise every entry of the archive info file list is visited:
#   - status "D": the entry is removed from the archive info object and from
#     the archive info database, and the directory holding the document file
#     is removed from disk;
#   - status "I" or "R": the entry is marked "B" ("been indexed").
# Finally the archive info is saved back to its file; the result of that
# save is returned.
#
# Safety: the directory that gets removed is the parent directory of the
# recorded document file. If an entry records no file, or a file with no
# directory part of its own, that parent would be the archives directory
# itself (or something above it), so such entries are never removed from
# disk.
sub deinit {
    my ($self) = @_;

    my $archive_info = $self->{'archive_info'};
    return unless defined $archive_info;

    my $verbosity             = $self->{'verbosity'};
    my $outhandle             = $self->{'outhandle'};
    my $archive_info_filename = $self->{'archive_info_filename'};
    my $base_dir              = $self->{'base_dir'};

    # fileparse is called by its full name below; make sure the module is
    # really loaded instead of relying on some other module having done so
    require File::Basename;

    my $file_list = $archive_info->get_file_list();

    foreach my $subfile (@$file_list) {
        my $doc_file = $subfile->[0];
        my $doc_oid  = $subfile->[1];

        my $index_status = $archive_info->get_status_info($doc_oid);

        if ($index_status eq "D") {
            # delete
            $archive_info->delete_info($doc_oid);
            &GDBMUtils::gdbmDatabaseRemove($archive_info_filename,$doc_oid);

            if (!defined $doc_file || $doc_file eq "") {
                print $outhandle "ArchivesInfPlugin: no file recorded for $doc_oid, nothing removed from disk\n"
                    if ($verbosity>0);
                next;
            }

            # directory part of the file relative to the archives directory;
            # fileparse reports "./" (or the platform equivalent) when the
            # name has no directory component
            my (undef, $relative_dir) = File::Basename::fileparse($doc_file);
            (my $relative_dir_trimmed = $relative_dir) =~ s{[/\\]+$}{};

            if ($relative_dir_trimmed eq "" || $relative_dir_trimmed eq ".") {
                print $outhandle "ArchivesInfPlugin: $doc_file has no directory of its own, nothing removed from disk\n"
                    if ($verbosity>0);
                next;
            }

            my $doc_filename = &util::filename_cat($base_dir,$doc_file);

            my (undef, $doc_dirname)
                = File::Basename::fileparse($doc_filename, "\\.[^\\.]+\$");

            print $outhandle "Removing $doc_dirname\n" if ($verbosity>2);

            &util::rm_r($doc_dirname);
        }
        elsif ($index_status =~ m/^(I|R)$/) {
            # mark as "been indexed"
            $archive_info->set_status_info($doc_oid,"B");
        }
    }

    return $archive_info->save_info($archive_info_filename);
}
132
# Called at the beginning of each plugin pass (import has one, building has
# many). Remembers the base directory of the pass; the other arguments are
# accepted but not used here.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->{'base_dir'} = $base_dir;
}
140
# Called at the end of each plugin pass. This plugin has nothing to do at
# that point.
sub end {
    my ($self) = shift;
}
146
# Called if we are doing incremental building: records the given flag on
# the plugin, where read() consults it.
sub set_incremental {
    my ($self, $incremental) = @_;

    $self->{'incremental'} = $incremental;
}
154
# Tells the caller whether this class might recurse using $pluginfo.
# It does (read() calls plugin::read), so the answer is always 1.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
161
162
# Accepts a statistics argument but contributes nothing to it: this plugin
# keeps no statistics of its own.
sub compile_stats {
    my $self = shift;
    my ($stats) = @_;
}
167
# This plugin does not take part in metadata reading: whatever it is
# offered, it answers undef.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    return undef;
}
175
# Blocks the "OIDcount" file so that it is not processed as a document.
#   $base_dir, $file - location of the candidate file
#   $block_hash      - hash ref; the full path of a blocked file is
#                      recorded under its 'file_blocks' entry
# Returns 1 when $file is "OIDcount" and has been blocked, undef for any
# other file (no file blocking is done for those).
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    return undef unless $file eq "OIDcount";

    # only the first value returned (the full path) is needed
    my ($oidcount_path) = &util::get_full_filenames($base_dir, $file);
    $block_hash->{'file_blocks'}->{$oidcount_path} = 1;

    return 1;
}
192
193
# Processes an archives directory by handing every document listed in its
# archive information database (archiveinf-doc.ldb or archiveinf-doc.bdb,
# depending on endianness) to plugin::read.
#
#   $base_dir, $file - location to look in; $base_dir might be "" and $file
#                      might include directories
#   $processor       - build processor; asked for its mode and whether it
#                      is incremental capable
#   $maxdocs         - upper limit on documents, -1 for no limit
#   $total_count     - number of documents already processed by the caller
#   $metadata        - not carried on to the next level
#
# Returns the number of files processed, or undef if there is no archive
# information database at the given location (someone else will have to
# process it).
sub read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $archive_info_filename
        = &util::filename_cat($base_dir, $file, "archiveinf-doc$db_ext");

    # wasn't an archives directory, someone else will have to process it
    return undef unless -e $archive_info_filename;

    if ($self->{'verbosity'} > 1) {
        &gsprintf($outhandle, "ArchivesInfPlugin: {common.processing} $archive_info_filename\n");
    }

    # read in the archives information file; it is kept on the object,
    # together with its filename, for deinit()
    my $archive_info = arcinfo->new();
    $self->{'archive_info'} = $archive_info;
    $self->{'archive_info_filename'} = $archive_info_filename;
    $archive_info->load_info($archive_info_filename);

    # index status => suffix appended to the processor mode while the
    # document is handled by an incremental build
    my %mode_suffix_for = ("D" => "delete", "R" => "reindex");

    my $count = 0;

    foreach my $entry (@{ $archive_info->get_file_list() }) {

        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

        my $doc_path = &util::filename_cat($file, $entry->[0]);
        next if $doc_path eq $file;

        my $doc_oid = $entry->[1];
        my $index_status = $archive_info->get_status_info($doc_oid);

        my $curr_mode = $processor->get_mode();
        my $new_mode = $curr_mode;

        # every document is processed unless its status says otherwise
        my $process_file = 1;

        if ($processor->is_incremental_capable() && $self->{'incremental'}) {
            if ($index_status eq "B") {
                # already indexed: nothing to do for this document
                $process_file = 0;
            }
            elsif (exists $mode_suffix_for{$index_status}) {
                # needs to be deleted from, or replaced in, the index
                $new_mode = $curr_mode.$mode_suffix_for{$index_status};
            }
        }
        elsif ($index_status eq "D") {
            # Non-incremental delete: the document has already been removed
            # from the archives directory (done during import.pl), so all
            # that is left to do here is not process it
            $process_file = 0;
        }

        # nag (once per plugin object) when -incremental is combined with a
        # processor that cannot build incrementally
        if (!$processor->is_incremental_capable() && $self->{'incremental'}) {
            unless (defined $self->{'incremental-warning'}) {
                print $outhandle "\n";
                print $outhandle "Warning: command-line option '-incremental' used with *non-incremental*\n";
                print $outhandle " processor '", ref $processor, "'. Some conflicts may arise.\n";
                print $outhandle "\n";
                sleep 10;
                $self->{'incremental-warning'} = 1;
            }
        }

        next unless $process_file;

        # switch the processor mode for this one document only
        my $mode_changed = ($new_mode ne $curr_mode);
        $processor->set_mode($new_mode) if $mode_changed;

        # note: metadata is not carried on to the next level
        $count += &plugin::read($pluginfo, $base_dir, $doc_path, $block_hash, {},
                                $processor, $maxdocs, ($total_count+$count), $gli);

        $processor->set_mode($curr_mode) if $mode_changed;
    }

    return $count;
}
302
3031;
Note: See TracBrowser for help on using the repository browser.