source: main/trunk/greenstone2/perllib/plugins/ArchivesInfPlugin.pm@ 32462

Last change on this file since 32462 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.7 KB
RevLine 
[537]1###########################################################################
2#
[16013]3# ArchivesInfPlugin.pm --
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
# Plugin which reads through an archives.inf file (or the equivalent
# archiveinf-doc info database) -- i.e. the file generated in the archives
# directory when an import is done -- processing each file it finds.
[4]29
[15870]30package ArchivesInfPlugin;
[4]31
32use util;
[27306]33use FileUtils;
[18528]34use doc;
[31492]35use CommonUtil;
[4]36use plugin;
37use arcinfo;
[5680]38use gsprintf;
[4]39
[10254]40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
[21563]43
# Set up inheritance from CommonUtil.  This lives in a BEGIN block so that
# @ISA is populated while the file is still being compiled.
BEGIN {
    @ArchivesInfPlugin::ISA = qw(CommonUtil);
}
47
# Flags this plugin adds on top of whatever it inherits.  read() consults
# 'reversesort' first and falls back to 'sort'.
# NOTE(review): the 'desc' values look like resource-bundle lookup keys
# rather than display text -- confirm against the strings file.
my $arguments = [
    {
        'name'    => "reversesort",
        'desc'    => "{ArchivesInfPlugin.reversesort}",
        'type'    => "flag",
        'reqd'    => "no",
        'modegli' => "2",
    },
    {
        'name'    => "sort",
        'desc'    => "{ArchivesInfPlugin.sort}",
        'type'    => "flag",
        'reqd'    => "no",
        'modegli' => "2",
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "ArchivesInfPlugin",
    'desc'     => "{ArchivesInfPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
[10254]67
# Package-local shim: lets the plugin call gsprintf(...) unqualified.
# Every argument is forwarded unchanged to gsprintf::gsprintf and its
# result is returned as-is.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
72
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on untouched to the CommonUtil constructor
#   $hashArgOptLists - hash ref; its "ArgList" array receives this plugin's
#                      argument descriptions and its "OptList" array receives
#                      the plugin's option record
# Returns the object built by CommonUtil, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call.  The indirect-object form
    # ("new CommonUtil(...)") leaves Perl to guess whether "new" is a method
    # or a function, which is fragile inside a package that defines its own
    # new().
    my $self = CommonUtil->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
85
# Called once, at the start of processing.
#   $verbosity  - always stored on the object
#   $outhandle  - stored only when defined, so an existing handle survives
#   $failhandle - always stored, even when undefined
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
99
# Called when processing is finished.  If read() loaded an archive info
# database, every entry in it has its index status finalised:
#   "D"       - the entry is dropped from the arcinfo object and from the
#               info database, then the document's directory is removed
#   "I" / "R" - the entry is marked "B" (been indexed)
# Afterwards the database write handle is closed and the archive info is
# saved back to the file it was loaded from.
# Does nothing when no archive info was loaded.
sub deinit {
    my ($self) = @_;

    my $archive_info = $self->{'archive_info'};
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    return unless defined $archive_info;

    # fileparse() is called fully qualified below.  Load its module here
    # instead of relying on another module having already pulled it in.
    require File::Basename;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};
    my $archive_info_filename = $self->{'archive_info_filename'};
    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $archive_info_filename, "append");

    my $file_list = $archive_info->get_file_list();

    foreach my $subfile (@$file_list) {
        my $doc_oid = $subfile->[1];

        my $index_status = $archive_info->get_status_info($doc_oid);

        # No recorded status means there is nothing to finalise; skipping
        # also avoids string-comparing an undefined value.
        next unless defined $index_status;

        if ($index_status eq "D") {
            # delete
            $archive_info->delete_info($doc_oid);
            &dbutil::delete_infodb_entry($infodbtype, $infodb_file_handle, $doc_oid);

            my $doc_file = $subfile->[0];
            my $base_dir = $self->{'base_dir'};

            # With no document path, fileparse() below would be splitting
            # (presumably) base_dir itself, so the directory it yields would
            # not be a per-document directory.  Never hand that to a
            # recursive delete.
            if (!defined $doc_file || $doc_file eq "") {
                print $outhandle "ArchivesInfPlugin: no file recorded for $doc_oid, not removing any directory\n" if ($verbosity>1);
                next;
            }

            my $doc_filename = &FileUtils::filenameConcatenate($base_dir,$doc_file);

            # Strip the final extension; only the directory part is used.
            my ($doc_tailname, $doc_dirname, $suffix)
                = File::Basename::fileparse($doc_filename, "\\.[^\\.]+\$");

            print $outhandle "Removing $doc_dirname\n" if ($verbosity>2);

            &FileUtils::removeFilesRecursive($doc_dirname);
        }
        elsif ($index_status =~ m/^(I|R)$/) {
            # mark as "been indexed"
            $archive_info->set_status_info($doc_oid,"B");
        }
    }

    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
    $archive_info->save_info($archive_info_filename);
}
149
# Called at the beginning of each plugin pass (import has one, building has
# many).  Only $base_dir is used: it is remembered on the object so that
# deinit() can resolve document paths against it.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->{'base_dir'} = $base_dir;
}
157
# Part of the plugin interface; this plugin has nothing to remove, so the
# arguments are unpacked and then ignored.
sub remove_all {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
[21308]162
# Part of the plugin interface.  Always answers undef: it is only called
# during import at this stage, and this plugin never processes a file then.
sub remove_one {
    my ($self, $file, $oids, $archivedir) = @_;

    return undef;
}
169
170
# Called at the end of each plugin pass.  Nothing to tidy up here.
sub end {
    my ($self) = shift;
}
176
177
# Tells the caller whether this class might recurse using $pluginfo.
# It does (read() hands each listed file back to plugin::read), so the
# answer is always 1.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
184
[10156]185
# Part of the plugin interface; this plugin contributes nothing to $stats.
sub compile_stats {
    my $self = shift;
    my ($stats) = @_;
}
[10156]190
# This plugin does no metadata reading: whatever it is handed, the answer
# is undef.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    return undef;
}
200
# File blocking hook.  The only file this plugin blocks is one named exactly
# "OIDcount": its full path is registered in $block_hash through
# block_raw_filename() and 1 is returned.  Every other file gets undef.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    # otherwise, we don't do any file blocking
    return undef unless $file eq "OIDcount";

    my ($filename_full_path, $filename_no_path)
        = &util::get_full_filenames($base_dir, $file);
    $self->block_raw_filename($block_hash, $filename_full_path);

    return 1;
}
217
218
# Returns the number of files processed, or undef if this plugin can't
# process $file.  Note that $base_dir might be "" and that $file might
# include directories.
#
# Only acts when $processor is a build processor (class name ending in
# "buildproc") and an archiveinf-doc info database exists for
# $base_dir/$file.  Each document listed in that database is then handed
# back to plugin::read, subject to $maxdocs and to the document's index
# status:
#   - documents at group position > 1 are skipped (only the first document
#     of a group doc.xml is processed)
#   - with an incremental-capable processor and -incremental set:
#       "B" is skipped, "D" is processed in "<mode>delete" mode,
#       "R" is processed in "<mode>reindex" mode
#   - otherwise "D" is skipped (already removed from archives by import.pl)
sub read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # This function only makes sense at build-time
    return if (ref($processor) !~ /buildproc$/i);

    # Get the infodbtype value for this collection from the buildproc object
    my $infodbtype = $processor->{'infodbtype'};
    if ($infodbtype eq "gdbm-txtgz") {
        $infodbtype = "gdbm";
    }

    # see if this has an archives information file within it
    my $archive_dir = &FileUtils::filenameConcatenate($base_dir, $file);
    my $archive_info_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);

    # wasn't an archives directory, someone else will have to process it
    return undef unless (-e $archive_info_filename);

    # found an archives.inf file
    &gsprintf($outhandle, "ArchivesInfPlugin: {common.processing} $archive_info_filename\n") if $self->{'verbosity'} > 1;

    # read in the archives information file; deinit() needs both the object
    # and the filename later on, so keep them on $self
    my $archive_info = new arcinfo($infodbtype);
    $self->{'archive_info'} = $archive_info;
    $self->{'archive_info_filename'} = $archive_info_filename;

    # the ordering is chosen before the info is loaded
    if ($self->{'reversesort'}) {
        $archive_info->reverse_sort();
    }
    elsif ($self->{'sort'}) {
        $archive_info->sort();
    }

    $archive_info->load_info ($archive_info_filename);

    # status => suffix appended to the processor mode for incremental builds
    my %mode_suffix_for = ("D" => "delete", "R" => "reindex");

    my $count = 0;

    # process each file
    foreach my $entry (@{ $archive_info->get_file_list() }) {

        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

        my $subpath = &FileUtils::filenameConcatenate($file, $entry->[0]);
        next if $subpath eq $file;

        my $doc_oid = $entry->[1];
        my $index_status = $archive_info->get_status_info($doc_oid);

        my $curr_mode = $processor->get_mode();
        my $new_mode = $curr_mode;
        my $group_position = $archive_info->get_group_position($doc_oid);

        # Start by assuming we want to process the file...
        my $process_file = 1;

        if (defined $group_position && $group_position > 1) {
            # ... unless it was folded into a group doc.xml, where only the
            # first member's xml is processed
            $process_file = 0;
        }
        elsif ($processor->is_incremental_capable() && $self->{'incremental'}) {
            # ... or the build is incremental, where the status flag decides
            if ($index_status eq "B") {
                # already indexed, nothing to do
                $process_file = 0;
            }
            elsif (exists $mode_suffix_for{$index_status}) {
                # needs deleting from, or replacing in, the index
                $new_mode = $curr_mode.$mode_suffix_for{$index_status};
            }
        }
        elsif ($index_status eq "D") {
            # Non-incremental delete: it's already been deleted from the
            # archives directory (done during import.pl), so all we need to
            # do here is not process it
            $process_file = 0;
        }

        # Nag feature: warn (once per plugin object) about -incremental
        # being used with a processor that can't honour it
        if (!$processor->is_incremental_capable() && $self->{'incremental'}
            && !defined $self->{'incremental-warning'}) {
            print $outhandle "\n";
            print $outhandle "Warning: command-line option '-incremental' used with *non-incremental*\n";
            print $outhandle "         processor '", ref $processor, "'. Some conflicts may arise.\n";
            print $outhandle "\n";
            sleep 10;
            $self->{'incremental-warning'} = 1;
        }

        next unless $process_file;

        # note: metadata is not carried on to the next level, and the
        # processor mode is only switched for the duration of this one file
        my $mode_changed = ($new_mode ne $curr_mode);

        $processor->set_mode($new_mode) if $mode_changed;

        $count += &plugin::read ($pluginfo, $base_dir, $subpath, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli);

        $processor->set_mode($curr_mode) if $mode_changed;
    }

    return $count;
}
342
3431;
Note: See TracBrowser for help on using the repository browser.