source: main/trunk/greenstone2/perllib/plugins/ArchivesInfPlugin.pm@ 27697

Last change on this file since 27697 was 27697, checked in by ak19, 11 years ago

Dr Bainbridge fixed it so that the gdb files generated on Windows for diffcol match those on Linux. This actually involved changing the order in which docids appear in archiveinf-doc, which in turn required the newly introduced -sort flag for ArchivesInfPlugin, used in combination with passing -sortmeta OID to import.pl.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1###########################################################################
2#
3# ArchivesInfPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
# Plugin which reads through an archives.inf file (or its archiveinf-doc
# info database equivalent) -- i.e. the file generated in the archives
# directory when an import is done -- processing each file it finds.
29
30package ArchivesInfPlugin;
31
32use util;
33use FileUtils;
34use doc;
35use PrintInfo;
36use plugin;
37use arcinfo;
38use gsprintf;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
43
# Set up inheritance at compile time: ArchivesInfPlugin derives from PrintInfo.
BEGIN {
    @ArchivesInfPlugin::ISA = qw(PrintInfo);
}
47
# Plugin-specific arguments. Both are optional flags:
#   reversesort / sort -- checked (in that order of precedence) by read()
#                         before the archive information is loaded.
# The 'desc' values are brace-delimited keys, presumably resolved to
# display text elsewhere -- confirm against the resource bundle.
my $arguments = [
    {
        name    => 'reversesort',
        desc    => '{ArchivesInfPlugin.reversesort}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name    => 'sort',
        desc    => '{ArchivesInfPlugin.sort}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '2',
    },
];
61
# Option record describing this plugin; new() appends it to the caller's
# "OptList". 'args' points at the plugin's own argument list.
my $options = {
    name     => 'ArchivesInfPlugin',
    desc     => '{ArchivesInfPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
67
# Package-local shorthand: forwards every argument, unchanged, to
# gsprintf::gsprintf and hands back whatever that returns.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
72
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to the PrintInfo constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its option record to "OptList"
#
# Returns the object built by PrintInfo, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method form. The indirect-object form "new PrintInfo(...)"
    # is only parsed as a method call when the PrintInfo package is already
    # known at compile time; inside a package that defines its own new() it
    # could otherwise be read as a call to that sub.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
85
# Called once, at the start of processing.
#
# Stores the verbosity and the fail handle handed in by the caller. The out
# handle is only replaced when one is actually supplied, so an existing
# $self->{'outhandle'} survives an undefined argument.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
99
# Finalises the archive information recorded by read().
#
# Does nothing unless read() stored an arcinfo object in
# $self->{'archive_info'}. Otherwise, for every entry in the file list:
#   status "D"        - the entry is removed from the arcinfo object and from
#                       the info database, and the directory containing the
#                       document file is removed with util::rm_r
#   status "I" or "R" - the status is changed to "B" ("been indexed")
# Finally the database write handle is closed and the archive information is
# saved back to the file it was loaded from.
sub deinit {
    my $self = shift;

    my $arc_info = $self->{'archive_info'};
    my $out      = $self->{'outhandle'};
    my $verbose  = $self->{'verbosity'};

    if (defined $arc_info) {
        # The infodbtype for this collection is carried by the arcinfo object
        my $dbtype      = $arc_info->{'infodbtype'};
        my $db_filename = $self->{'archive_info_filename'};
        my $db_handle   = &dbutil::open_infodb_write_handle($dbtype, $db_filename, "append");

        for my $entry (@{ $arc_info->get_file_list() }) {
            # each entry is [ document file, document OID ]
            my $oid    = $entry->[1];
            my $status = $arc_info->get_status_info($oid);

            if ($status =~ m/^(I|R)$/) {
                # mark as "been indexed"
                $arc_info->set_status_info($oid, "B");
            }
            elsif ($status eq "D") {
                # delete: drop the record, then the document's directory
                $arc_info->delete_info($oid);
                &dbutil::delete_infodb_entry($dbtype, $db_handle, $oid);

                my $doc_path = &FileUtils::filenameConcatenate($self->{'base_dir'}, $entry->[0]);

                # only the directory part of the parsed path is needed
                my (undef, $doc_dir) = File::Basename::fileparse($doc_path, "\\.[^\\.]+\$");

                print $out "Removing $doc_dir\n" if ($verbose > 2);

                &util::rm_r($doc_dir);
            }
        }

        &dbutil::close_infodb_write_handle($dbtype, $db_handle);
        $arc_info->save_info($db_filename);
    }
}
149
# Called at the beginning of each plugin pass (import has one, building has
# many). Only the base directory is remembered; the other arguments
# ($pluginfo, $processor, $maxdocs) are ignored here.
sub begin {
    my ($self, undef, $base_dir) = @_;

    $self->{'base_dir'} = $base_dir;
}
157
# Intentionally empty for this plugin: the arguments are unpacked but
# nothing is done with them.
sub remove_all {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
162
# Only called during import at this stage, and this plugin never processes a
# file then, so the answer is always undef.
sub remove_one {
    my ($self, $file, $oids, $archivedir) = @_;

    return undef;
}
169
170
# Called at the end of each plugin pass. There is nothing to clean up per
# pass for this plugin, so the body is empty apart from unpacking $self.
sub end {
    my ($self) = shift;
}
176
177
# Tells the caller whether this class might recurse using $pluginfo.
# Always 1: read() hands every listed document back to plugin::read.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
184
185
# Intentionally empty: $stats is accepted but this plugin contributes
# nothing to it.
sub compile_stats {
    my $self = shift;
    my ($stats) = @_;
}
190
# This plugin does not take part in metadata reading: whatever is passed in,
# the answer is undef.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    return undef;
}
200
# Blocks the "OIDcount" file and nothing else.
#
# When $file is exactly "OIDcount", its full path (as computed by
# util::get_full_filenames from $base_dir and $file) is registered in
# $block_hash via util::block_filename and 1 is returned. For every other
# file no blocking is done and undef is returned.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    # otherwise, we don't do any file blocking
    return undef unless $file eq "OIDcount";

    # only the first value (the full path) is of interest here
    my ($oidcount_path) = &util::get_full_filenames($base_dir, $file);
    &util::block_filename($block_hash, $oidcount_path);

    return 1;
}
217
218
# Reads the archive information database found under $base_dir/$file and
# passes every document listed in it on to plugin::read.
#
# Parameters (after $self):
#   $pluginfo    - plugin list, forwarded to plugin::read
#   $base_dir    - base directory; might be ""
#   $file        - path relative to $base_dir; might include directories
#   $block_hash  - forwarded to plugin::read
#   $metadata    - not carried on to the next level (an empty hash is passed)
#   $processor   - build processor; its class name must end in "buildproc"
#   $maxdocs     - maximum number of documents to process, -1 for no limit
#   $total_count - number of documents already processed by the caller
#   $gli         - forwarded to plugin::read
#
# Returns:
#   - nothing (bare return) when $processor is not a build processor,
#   - undef when no archive information file exists at that location, so
#     that someone else gets to process $file,
#   - otherwise the sum of the values returned by plugin::read.
#
# Side effects: stores the arcinfo object and its file name in
# $self->{'archive_info'} / $self->{'archive_info_filename'} for deinit().
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $count = 0;

    # This function only makes sense at build-time
    return if (ref($processor) !~ /buildproc$/i);

    # Get the infodbtype value for this collection from the buildproc object;
    # "gdbm-txtgz" is looked up as plain "gdbm" here
    my $infodbtype = $processor->{'infodbtype'};
    $infodbtype = "gdbm" if $infodbtype eq "gdbm-txtgz";

    # see if this has an archives information file within it
    my $archive_info_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", &FileUtils::filenameConcatenate($base_dir, $file));

    if (-e $archive_info_filename) {

        # found an archives information file
        &gsprintf($outhandle, "ArchivesInfPlugin: {common.processing} $archive_info_filename\n") if $self->{'verbosity'} > 1;

        # Read in the archives information file.
        # Explicit Class->method form: the indirect-object form
        # "new arcinfo(...)" is only parsed as a method call when the arcinfo
        # package is already known at compile time; because this package
        # defines its own new(), it could otherwise be read as a call to that.
        my $archive_info = arcinfo->new($infodbtype);
        $self->{'archive_info'} = $archive_info;
        $self->{'archive_info_filename'} = $archive_info_filename;

        # the ordering has to be requested before the information is loaded;
        # -reversesort takes precedence over -sort
        if ($self->{'reversesort'}) {
            $archive_info->reverse_sort();
        } elsif ($self->{'sort'}) {
            $archive_info->sort();
        }

        $archive_info->load_info ($archive_info_filename);

        my $file_list = $archive_info->get_file_list();

        # process each file; every entry is [ document file, document OID ]
        foreach my $subfile (@$file_list) {

            last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

            # skip entries that resolve to $file itself
            my $tmp = &FileUtils::filenameConcatenate($file, $subfile->[0]);
            next if $tmp eq $file;

            my $doc_oid = $subfile->[1];
            my $index_status = $archive_info->get_status_info($doc_oid);

            my $curr_mode = $processor->get_mode();
            my $new_mode = $curr_mode;

            # Start by assuming we want to process the file...
            my $process_file = 1;

            # ...unless the build processor is incremental capable and
            # -incremental was specified, in which case we need to check
            # its index_status flag
            if ($processor->is_incremental_capable() && $self->{'incremental'})
            {
                # Check to see if the file needs indexing
                if ($index_status eq "B")
                {
                    # Don't process this file as it has already been indexed
                    $process_file = 0;
                }
                elsif ($index_status eq "D") {
                    # Needs to be deleted from the index.
                    $new_mode = $curr_mode."delete";
                    $process_file = 1;
                }
                elsif ($index_status eq "R") {
                    # Needs to be reindexed/replaced
                    $new_mode = $curr_mode."reindex";

                    $process_file = 1;
                }
            }
            # ... or we're being asked to delete it (in which case skip it)
            elsif ($index_status eq "D") {
                # Non-incremental Delete
                # It's already been deleted from the archives directory
                # (done during import.pl)
                # => All we need to do here is not process it

                $process_file = 0;
            }

            if (!$processor->is_incremental_capable() && $self->{'incremental'}) {
                # Nag feature: warn once per plugin object, then pause for
                # 10 seconds
                if (!defined $self->{'incremental-warning'}) {
                    print $outhandle "\n";
                    print $outhandle "Warning: command-line option '-incremental' used with *non-incremental*\n";
                    print $outhandle " processor '", ref $processor, "'. Some conflicts may arise.\n";
                    print $outhandle "\n";
                    sleep 10;
                    $self->{'incremental-warning'} = 1;
                }
            }

            if ($process_file) {
                # note: metadata is not carried on to the next level

                # switch the processor into delete/reindex mode for this one
                # document only, restoring the previous mode afterwards
                $processor->set_mode($new_mode) if ($new_mode ne $curr_mode);

                $count += &plugin::read ($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli);

                $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode);
            }
        }

        return $count;
    }


    # wasn't an archives directory, someone else will have to process it
    return undef;
}
337
3381;
Note: See TracBrowser for help on using the repository browser.