source: gsdl/trunk/perllib/inexport.pm@ 19617

Last change on this file since 19617 was 19498, checked in by davidb, 15 years ago

Supporting routines that exploit the new 'metafiles' structures, introduction to track which metadata.xml file a piece of metadata came from

  • Property svn:executable set to *
File size: 9.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use util;
31use GDBMUtils;
32
33
34sub prime_doc_oid_count
35{
36 my ($archivedir) = @_;
37 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
38
39 if (-e $oid_count_filename) {
40 if (open(OIDIN,"<$oid_count_filename")) {
41 my $OIDcount = <OIDIN>;
42 chomp $OIDcount;
43 close(OIDIN);
44
45 $doc::OIDcount = $OIDcount;
46 }
47 else {
48
49 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
50 print STDERR "Setting value to 0\n";
51 }
52 }
53
54}
55
56sub store_doc_oid_count
57{
58 # Use the file "OIDcount" in the archives directory to record
59 # what value doc.pm got up to
60
61 my ($archivedir) = @_;
62 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
63
64
65 if (open(OIDOUT,">$oid_count_filename")) {
66 print OIDOUT $doc::OIDcount, "\n";
67
68 close(OIDOUT);
69 }
70 else {
71 print STDERR "Warning: unable to store document OID count\n";
72 }
73}
74
75
76
77sub new_vs_old_import_diff
78{
79 my ($archive_info,$block_hash,$importdir) = @_;
80
81 # First convert all files to absolute form
82 # This is to support the situation where the import folder is not
83 # the default
84
85 my $prev_all_files = $archive_info->{'prev_import_filelist'};
86 my $full_prev_all_files = {};
87
88 foreach my $prev_file (keys %$prev_all_files) {
89
90 if (!&util::filename_is_absolute($prev_file)) {
91 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
92 $full_prev_all_files->{$full_prev_file} = $prev_file;
93 }
94 else {
95 $full_prev_all_files->{$prev_file} = $prev_file;
96 }
97 }
98
99
100 # Figure out which are the new files, existing files and so
101 # by implication the files from the previous import that are not
102 # there any more => mark them for deletion
103 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
104
105 my $full_curr_file = $curr_file;
106
107 # entry in 'all_files' is moved to either 'existing_files',
108 # 'deleted_files', or 'new_files'
109
110 if (!&util::filename_is_absolute($curr_file)) {
111 # add in import dir to make absolute
112 $full_curr_file = &util::filename_cat($importdir,$curr_file);
113 }
114
115 if (defined $block_hash->{'file_blocks'}->{$full_curr_file}) {
116 # If in block list, we want to ignore it
117 delete $block_hash->{'all_files'}->{$curr_file};
118
119 if (defined $full_prev_all_files->{$full_curr_file}) {
120 # also make sure it is gone from 'previous' list so
121 # not mistaken for a file that needs to be deleted
122 delete $full_prev_all_files->{$full_curr_file};
123 }
124 next;
125 }
126
127 # figure out if new file or not
128 if (defined $full_prev_all_files->{$full_curr_file}) {
129
130 # had it before
131 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
132
133 # Now remove it, so by end of loop only the files
134 # that need deleting are left
135
136 delete $full_prev_all_files->{$full_curr_file};
137 }
138 else {
139 $block_hash->{'new_files'}->{$full_curr_file} = 1;
140 }
141
142 delete $block_hash->{'all_files'}->{$curr_file};
143 }
144
145 # By this point full_prev_all_files contains the files
146 # mentioned in archiveinf-src.db but are not in the 'import'
147 # folder (or whatever was specified through -importdir ...)
148
149 # This list can contain files that were created in the 'tmp' or
150 # 'cache' areas (such as screen-size and thumbnail images).
151 #
152 # In building the final list of files to delete, we test to see if
153 # it exists on the filesystem and if it does (unusual for a file
154 # that's allegedly deleted!), supress it from going into the final
155 # list
156
157 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
158
159 my @deleted_files = values %$full_prev_all_files;
160 map { my $curr_file = $_;
161 my $full_curr_file = $curr_file;
162
163 if (!&util::filename_is_absolute($curr_file)) {
164 # add in import dir to make absolute
165
166 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
167 }
168
169
170 if (!-e $full_curr_file) {
171 $block_hash->{'deleted_files'}->{$curr_file} = 1;
172 }
173 } @deleted_files;
174}
175
176
177sub is_assoc_file
178{
179 my ($file,$doc_rec) = @_;
180
181 foreach my $af (@{$doc_rec->{'assoc-file'}}) {
182 return 1 if ($af eq $file);
183 }
184
185 return 0;
186}
187
188
189sub _mark_docs_for_deletion
190{
191 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode_text) = @_;
192
193 my $doc_db = "archiveinf-doc.gdb";
194 my $src_db = "archiveinf-src.gdb";
195 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
196 my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
197
198
199 # record files marked for deletion in arcinfo
200 foreach my $file (@$deleted_files) {
201 # use 'archiveinf-src' GDBM file to look up all the OIDs
202 # this file is used in (note in most cases, it's just one OID)
203
204 my $src_rec = GDBMUtils::gdbmRecordToHash($arcinfo_src_filename,$file);
205 my $oids = $src_rec->{'oid'};
206 foreach my $oid (@$oids) {
207
208 # Find out if it's a main doc, assoc file, or metadata
209
210 my $doc_rec = GDBMUtils::gdbmRecordToHash($arcinfo_doc_filename,$oid);
211
212 if (is_assoc_file($file,$doc_rec)) {
213 # assoc file => mark it for re-indexing (safest thing to do)
214 my $curr_status = $archive_info->get_status_info($oid);
215
216
217 if (defined($curr_status) && (($curr_status ne "D") && ($curr_status ne "R"))) {
218 if ($verbosity > 1) {
219 print STDERR "$oid marked for (potential) reindexing\n";
220 print STDERR " (because associated file $file deleted)\n";
221 }
222 $archive_info->set_status_info($oid,"R");
223
224 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
225 $val =~ s/^<index-status>(.*)$/<index-status>R/m;
226 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
227 }
228 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$file);
229 }
230 else {
231 # either src-file or metadata.xml file linking to src-file
232
233 my $src_file;
234
235 if ($doc_rec->{'src-file'}->[0] ne $file) {
236 # it's a metadata file attached to this OID
237 # => workout the src-file it matches to
238
239 $src_file = $doc_rec->{'src-file'}->[0];
240
241 my $src_filename = $src_file;
242 if (!&util::filename_is_absolute($src_file)) {
243 $src_filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$src_file);
244 }
245
246 $block_hash->{'reindex_files'}->{$src_filename} = 1;
247 }
248 else {
249 # It's the main doc
250 # => make it the target and mark it for deletion
251 $src_file = $file;
252 }
253
254 # Whether the main file directly or indirectly, mark for deletion/reindex
255
256 if ($verbosity>1) {
257 print STDERR "$oid ($src_file) marked to be $mode_text on next buildcol.pl\n";
258 }
259 $archive_info->set_status_info($oid,"D");
260
261 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
262 my ($index_status) = ($val =~ m/^<index-status>(.*)$/m);
263 if ($index_status ne "D") {
264 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
265 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
266 }
267
268 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$src_file);
269 }
270
271 }
272 }
273}
274
275
276sub mark_docs_for_deletion
277{
278 _mark_docs_for_deletion(@_,"deleted from index");
279}
280
281
282sub mark_docs_for_reindex
283{
284 my ($archive_info,$block_hash,$archivedir,$verbosity) = @_;
285
286 # Reindexing is accomplished by deleting the previously indexed
287 # version of the document, and then allowing the new version to
288 # be indexed (as would a new document be indexed).
289 #
290 # The first step (marking for deletion) is implemented by this routine.
291 #
292 # By default in Greenstone a new version of an index will hash to
293 # a new unique OID, and the above strategy of reindex=delete+add
294 # works fine. A special case arises when a persistent OID is
295 # allocated to a document (for instance through a metadata field),
296 # and the second step to reindexing (see XXXX) detects this and
297 # deals with it appropriately.
298
299 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
300
301 my $doc_db = "archiveinf-doc.gdb";
302 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
303
304 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
305
306 my $reindex_files = [];
307
308 foreach my $existing_filename (@existing_files) {
309
310 if (-M $existing_filename < $archiveinf_timestamp) {
311 # file is newer than last build
312
313 my $existing_file = $existing_filename;
314 my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
315
316 my $collectdir_resafe = &util::filename_to_regex($collectdir);
317 $existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
318
319### print STDERR "**** Deleting existing file: $existing_file\n";
320
321 push(@$reindex_files,$existing_file);
322 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
323 }
324
325 }
326
327 _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex");
328
329}
330
331
332
3331;
Note: See TracBrowser for help on using the repository browser.