source: gsdl/trunk/perllib/inexport.pm@ 18528

Last change on this file since 18528 was 18528, checked in by davidb, 12 years ago

OIDmetadata wasn't supported in collect.cfg, but OIDtype was. Now rectified. Also introduced OIDcount as a file saved in the archives folder to help doc.pm use the correct value when working incrementally

  • Property svn:executable set to *
File size: 8.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use File::Basename;
31
32use util;
33use GDBMUtils;
34
35
36sub prime_doc_oid_count
37{
38 my ($archivedir) = @_;
39 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
40
41 if (-e $oid_count_filename) {
42 if (open(OIDIN,"<$oid_count_filename")) {
43 my $OIDcount = <OIDIN>;
44 chomp $OIDcount;
45 close(OIDIN);
46
47 $doc::OIDcount = $OIDcount;
48 }
49 else {
50
51 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
52 print STDERR "Setting value to 0\n";
53 }
54 }
55
56}
57
58sub store_doc_oid_count
59{
60 # Use the file "OIDcount" in the archives directory to record
61 # what value doc.pm got up to
62
63 my ($archivedir) = @_;
64 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
65
66
67 if (open(OIDOUT,">$oid_count_filename")) {
68 print OIDOUT $doc::OIDcount, "\n";
69
70 close(OIDOUT);
71 }
72 else {
73 print STDERR "Warning: unable to store document OID count\n";
74 }
75}
76
77
78
79sub new_vs_old_import_diff
80{
81 my ($archive_info,$block_hash,$importdir) = @_;
82
83 # First convert all files to absolute form
84 # This is to support the situation where the import folder is not
85 # the default
86
87 my $prev_all_files = $archive_info->{'prev_import_filelist'};
88 my $full_prev_all_files = {};
89
90 foreach my $prev_file (keys %$prev_all_files) {
91
92 if (!&util::filename_is_absolute($prev_file)) {
93 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
94 $full_prev_all_files->{$full_prev_file} = $prev_file;
95 }
96 else {
97 $full_prev_all_files->{$prev_file} = $prev_file;
98 }
99 }
100
101
102 # Figure out which are the new files, existing files and so
103 # by implication the files from the previous import that are not
104 # there any more => mark them for deletion
105 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
106
107 my $full_curr_file = $curr_file;
108
109 # entry in 'all_files' is moved to either 'existing_files',
110 # 'deleted_files', or 'new_files'
111
112 if (!&util::filename_is_absolute($curr_file)) {
113 # add in import dir to make absolute
114 $full_curr_file = &util::filename_cat($importdir,$curr_file);
115 }
116
117 if (defined $block_hash->{'file_blocks'}->{$full_curr_file}) {
118 # If in block list, we want to ignore it
119 delete $block_hash->{'all_files'}->{$curr_file};
120
121 if (defined $full_prev_all_files->{$full_curr_file}) {
122 # also make sure it is gone from 'previous' list so
123 # not mistaken for a file that needs to be deleted
124 delete $full_prev_all_files->{$full_curr_file};
125 }
126 next;
127 }
128
129 # figure of if new file or not
130 if (defined $full_prev_all_files->{$full_curr_file}) {
131
132 # had it before
133 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
134
135 # Now remove it, so by end of loop only the files
136 # that need deleting are left
137
138 delete $full_prev_all_files->{$full_curr_file};
139 }
140 else {
141 $block_hash->{'new_files'}->{$full_curr_file} = 1;
142 }
143
144 delete $block_hash->{'all_files'}->{$curr_file};
145 }
146
147 # By this point full_prev_all_files contains the files
148 # mentioned in archiveinf-src.db but are not in the 'import'
149 # folder (or whatever was specified through -importdir ...)
150
151 # This list can contain files that were created in the 'tmp' or
152 # 'cache' areas (such as screen-size and thumbnail images).
153 #
154 # In building the final list of files to delete, we test to see if
155 # it exists on the filesystem and if it does (unusual for a file
156 # that's allegedly deleted!) , supress it from going into the final
157 # list
158
159 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
160
161 my @deleted_files = values %$full_prev_all_files;
162 map { my $curr_file = $_;
163 my $full_curr_file = $curr_file;
164
165 if (!&util::filename_is_absolute($curr_file)) {
166 # add in import dir to make absolute
167
168 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
169 }
170
171
172 if (!-e $full_curr_file) {
173 $block_hash->{'deleted_files'}->{$curr_file} = 1;
174 }
175 } @deleted_files;
176}
177
178sub mark_docs_for_deletion
179{
180 my ($archive_info,$deleted_files_ref,$archivedir,$verbosity) = @_;
181
182 my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
183 my $doc_db = "archiveinf-doc$db_ext";
184 my $src_db = "archiveinf-src$db_ext";
185 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
186 my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
187
188
189 # record files marked for deletion in arcinfo
190 foreach my $file (@$deleted_files_ref) {
191 # use 'archiveinf-src' GDBM file to look up all the OIDs
192 # this file is used in (note in most cases, it's just one OID)
193
194 my $src_rec = GDBMUtils::gdbmRecordToHash($arcinfo_src_filename,$file);
195 my $oids = $src_rec->{'oid'};
196 foreach my $oid (@$oids) {
197
198 # Find out if it's an assoc file or main doc
199
200 my $doc_rec = GDBMUtils::gdbmRecordToHash($arcinfo_doc_filename,$oid);
201 if ($doc_rec->{'src-file'}->[0] eq $file) {
202 # It's the main doc
203 # => mark it for deletion
204
205 if ($verbosity>1) {
206 print STDERR "$oid ($file) marked to be deleted from index on next buildcol.pl\n";
207 }
208 $archive_info->set_status_info($oid,"D");
209
210 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
211 my ($index_status) = ($val =~ m/^<index-status>(.*)$/m);
212 if ($index_status ne "D") {
213 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
214 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
215 }
216 }
217 else {
218 # assoc file => mark it for re-indexing (safest thing to do)
219 my $curr_status = $archive_info->get_status_info($oid);
220
221
222 if (defined($curr_status) && (($curr_status ne "D") && ($curr_status ne "R"))) {
223 if ($verbosity > 1) {
224 print STDERR "$oid marked for (potential) reindexing\n";
225 print STDERR " (because associated file $file deleted)\n";
226 }
227 $archive_info->set_status_info($oid,"R");
228
229 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
230 $val =~ s/^<index-status>(.*)$/<index-status>R/m;
231 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
232 }
233 }
234
235 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$file);
236 }
237 }
238}
239
240
241
242sub mark_docs_for_reindex
243{
244 my ($archive_info,$existing_files_ref,$archivedir,$verbosity) = @_;
245
246 # Reindexing is accomplished by deleting the previously indexed
247 # version of the document, and then allowing the new version to
248 # be indexed (as would a new document be indexed).
249 #
250 # The first step (marking for deletion) is implemented by this routine.
251 #
252 # By default in Greenstone a new version of an index will hash to
253 # a new unique OID, and the above strategy of reindex=delete+add
254 # works fine. A special case arises when a persistent OID is
255 # allocated to a document (for instance through a metadata field),
256 # and the second step to reindexing (see XXXX) detects this and
257 # deals with it appropriately.
258
259 my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
260 my $doc_db = "archiveinf-doc$db_ext";
261 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
262
263
264 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
265
266 my $reindex_files_ref = [];
267
268 foreach my $existing_filename (@$existing_files_ref) {
269
270 if (-M $existing_filename < $archiveinf_timestamp) {
271 # file is newer than last build
272
273 my $existing_file = $existing_filename;
274 my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
275
276 $existing_file =~ s/^$collectdir(\\|\/)?//;
277
278 print STDERR "**** Deleting existing file: $existing_file\n";
279
280 push(@$reindex_files_ref,$existing_file);
281 }
282
283 }
284
285 mark_docs_for_deletion($archive_info,$reindex_files_ref,$archivedir,$verbosity);
286
287 return @$reindex_files_ref;
288}
289
290
291
292
2931;
Note: See TracBrowser for help on using the repository browser.