source: gsdl/trunk/perllib/inexport.pm@ 20776

Last change on this file since 20776 was 20776, checked in by kjdon, 15 years ago

in the middle of fixing small bugs in incremental build. lots of changes here, not sure what they are all for. One important one - if a metadata file has changed, then we need to reimport all files (same as in metadata file was new).

  • Property svn:executable set to *
File size: 12.9 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use File::Basename;
31
32use util;
33use GDBMUtils;
34
35
36sub prime_doc_oid_count
37{
38 my ($archivedir) = @_;
39 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
40
41 if (-e $oid_count_filename) {
42 if (open(OIDIN,"<$oid_count_filename")) {
43 my $OIDcount = <OIDIN>;
44 chomp $OIDcount;
45 close(OIDIN);
46
47 $doc::OIDcount = $OIDcount;
48 }
49 else {
50
51 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
52 print STDERR "Setting value to 0\n";
53 }
54 }
55
56}
57
58sub store_doc_oid_count
59{
60 # Use the file "OIDcount" in the archives directory to record
61 # what value doc.pm got up to
62
63 my ($archivedir) = @_;
64 my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
65
66
67 if (open(OIDOUT,">$oid_count_filename")) {
68 print OIDOUT $doc::OIDcount, "\n";
69
70 close(OIDOUT);
71 }
72 else {
73 print STDERR "Warning: unable to store document OID count\n";
74 }
75}
76
77
78
79sub new_vs_old_import_diff
80{
81 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
82
83 # in this method, we want to know if metadata files are modified or not.
84 my $doc_db = "archiveinf-doc.gdb";
85 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
86
87 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
88
89 # First convert all files to absolute form
90 # This is to support the situation where the import folder is not
91 # the default
92
93 my $prev_all_files = $archive_info->{'prev_import_filelist'};
94 my $full_prev_all_files = {};
95
96 foreach my $prev_file (keys %$prev_all_files) {
97
98 if (!&util::filename_is_absolute($prev_file)) {
99 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
100 $full_prev_all_files->{$full_prev_file} = $prev_file;
101 }
102 else {
103 $full_prev_all_files->{$prev_file} = $prev_file;
104 }
105 }
106
107
108 # Figure out which are the new files, existing files and so
109 # by implication the files from the previous import that are not
110 # there any more => mark them for deletion
111 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
112
113 my $full_curr_file = $curr_file;
114
115 # entry in 'all_files' is moved to either 'existing_files',
116 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
117
118 if (!&util::filename_is_absolute($curr_file)) {
119 # add in import dir to make absolute
120 $full_curr_file = &util::filename_cat($importdir,$curr_file);
121 }
122
123 # figure out if new file or not
124 if (defined $full_prev_all_files->{$full_curr_file}) {
125 # delete it so that only files that need deleting are left
126 delete $full_prev_all_files->{$full_curr_file};
127
128 # had it before. is it a metadata file?
129 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
130
131 # is it modified??
132 if (-M $full_curr_file < $archiveinf_timestamp) {
133 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
134 # its newer than last build
135 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
136 }
137 }
138 else {
139 if ($incremental_mode eq "all") {
140
141 # had it before
142 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
143
144 }
145 else {
146 # Warning in "onlyadd" mode, but had it before!
147 print STDERR "Warning: File $full_curr_file previously imported.\n";
148 print STDERR " Treating as new file\n";
149
150 $block_hash->{'new_files'}->{$full_curr_file} = 1;
151
152 }
153 }
154 }
155 else {
156 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
157 # the new file is the special sort of file greenstone uses
158 # to attach metadata to src documents
159 # i.e metadata.xml
160 # (but note, the filename used is not constrained in
161 # Greenstone to always be this)
162
163 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
164 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
165 }
166 else {
167 $block_hash->{'new_files'}->{$full_curr_file} = 1;
168 }
169 }
170
171
172 delete $block_hash->{'all_files'}->{$curr_file};
173 }
174
175
176 # Deal with complication of new or modified metadata files by forcing
177 # everything from this point down in the file hierarchy to
178 # be freshly imported.
179 #
180 # This may mean files that have not changed are reindexed, but does
181 # guarantee by the end of processing all new metadata is correctly
182 # associated with the relevant document(s).
183
184 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
185 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
186
187 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
188 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
189
190 # Go through existing_files, and mark anything that is contained
191 # within 'situated_dir' to be reindexed (in case some of the metadata
192 # attaches to one of these files)
193
194 my $reindex_files = [];
195
196 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
197
198 if ($existing_f =~ m/^$situated_dir/) {
199 push(@$reindex_files,$existing_f);
200 $block_hash->{'reindex_files'}->{$existing_f} = 1;
201
202 }
203 }
204
205 # Reindexing is accomplished by putting them in the list for reindexing (line above)
206 # and then tagging the arcinfo version as to be deleted.
207
208 _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex");
209
210 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
211 # (or equivalent)
212 $block_hash->{'new_files'}->{$new_mdf} = 1;
213
214 }
215
216
217 # By this point full_prev_all_files contains the files
218 # mentioned in archiveinf-src.db but are not in the 'import'
219 # folder (or whatever was specified through -importdir ...)
220
221 # This list can contain files that were created in the 'tmp' or
222 # 'cache' areas (such as screen-size and thumbnail images).
223 #
224 # In building the final list of files to delete, we test to see if
225 # it exists on the filesystem and if it does (unusual for a "normal"
226 # file in import, but possible in the case of 'tmp' files),
227 # supress it from going into the final list
228
229 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
230
231 my @deleted_files = values %$full_prev_all_files;
232 map { my $curr_file = $_;
233 my $full_curr_file = $curr_file;
234
235 if (!&util::filename_is_absolute($curr_file)) {
236 # add in import dir to make absolute
237
238 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
239 }
240
241
242 if (!-e $full_curr_file) {
243 $block_hash->{'deleted_files'}->{$curr_file} = 1;
244 }
245 } @deleted_files;
246
247
248
249}
250
251
252sub is_assoc_file
253{
254 my ($file,$doc_rec) = @_;
255
256 my ($file_root,$dirname,$suffix) = fileparse($file, "\\.[^\\.]+\$");
257
258 foreach my $af (@{$doc_rec->{'assoc-file'}}) {
259 my $full_af = &util::filename_cat($dirname,$af);
260
261 return 1 if ($full_af eq $file);
262 }
263
264 return 0;
265}
266
267
268sub _mark_docs_for_deletion
269{
270 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode_text) = @_;
271
272 my $doc_db = "archiveinf-doc.gdb";
273 my $src_db = "archiveinf-src.gdb";
274 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
275 my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
276
277
278 # record files marked for deletion in arcinfo
279 foreach my $file (@$deleted_files) {
280 # use 'archiveinf-src' GDBM file to look up all the OIDs
281 # that this file is used in (note in most cases, it's just one OID)
282
283 my $src_rec = GDBMUtils::gdbmRecordToHash($arcinfo_src_filename,$file);
284 my $oids = $src_rec->{'oid'};
285 my $file_record_deleted = 0;
286 foreach my $oid (@$oids) {
287 # Find out if it's a main doc, assoc file, or metadata
288
289 my $doc_rec = GDBMUtils::gdbmRecordToHash($arcinfo_doc_filename,$oid);
290 my $doc_source_file = $doc_rec->{'src-file'}->[0];
291 if (!&util::filename_is_absolute($doc_source_file)) {
292 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
293 }
294
295 if (is_assoc_file($file,$doc_rec)) {
296 ## -- kjdon - here, do same thing as for metadata file??
297 ## mark source for reimport??
298 # assoc file => mark it for re-indexing (safest thing to do)
299 my $curr_status = $archive_info->get_status_info($oid);
300
301 # mark source doc for reimport as one of its assoc files has changed or deleted
302 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
303 if (defined($curr_status) && (($curr_status ne "D") && ($curr_status ne "R"))) {
304 if ($verbosity > 1) {
305 print STDERR "$oid marked for (potential) reindexing\n";
306 print STDERR " (because associated file $file deleted)\n";
307 }
308 $archive_info->set_status_info($oid,"R");
309
310 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
311 $val =~ s/^<index-status>(.*)$/<index-status>R/m;
312 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
313 }
314 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$file) unless $file_record_deleted;
315 $file_record_deleted = 1;
316 }
317 else {
318 # either src-file or metadata.xml file linking to src-file
319 # actually, metadata files should not get here, as are
320 # processed earlier
321
322 if ($doc_rec->{'src-file'}->[0] ne $file) {
323 # it's a metadata file attached to this OID
324 # => reindex the src-file it matches to
325
326 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
327
328 # remove the metadata file from the src-database
329
330 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$file) unless $file_record_deleted;
331 $file_record_deleted = 1;
332 }
333
334 # Whether the main file directly or indirectly, mark for deletion/reindex
335
336 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
337 my ($index_status) = ($val =~ m/^<index-status>(.*)$/m);
338
339 if ($index_status ne "D") {
340 if ($verbosity>1) {
341 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
342 }
343 $archive_info->set_status_info($oid,"D");
344
345 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
346 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
347
348 GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$doc_source_file);
349 }
350
351
352 }
353
354 }
355 }
356}
357
358
359sub mark_docs_for_deletion
360{
361 _mark_docs_for_deletion(@_,"deleted from index");
362}
363
364
365sub mark_docs_for_reindex
366{
367 my ($archive_info,$block_hash,$archivedir,$verbosity) = @_;
368
369 # Reindexing is accomplished by deleting the previously indexed
370 # version of the document, and then allowing the new version to
371 # be indexed (as would a new document be indexed).
372 #
373 # The first step (marking for deletion) is implemented by this routine.
374 #
375 # By default in Greenstone a new version of an index will hash to
376 # a new unique OID, and the above strategy of reindex=delete+add
377 # works fine. A special case arises when a persistent OID is
378 # allocated to a document (for instance through a metadata field),
379 # and the second step to reindexing (see XXXX) detects this and
380 # deals with it appropriately.
381
382 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
383
384 my $doc_db = "archiveinf-doc.gdb";
385 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
386
387 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
388
389 my $reindex_files = [];
390
391 foreach my $existing_filename (@existing_files) {
392 if (-M $existing_filename < $archiveinf_timestamp) {
393 # file is newer than last build
394
395 my $existing_file = $existing_filename;
396 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
397
398 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
399 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
400
401 print STDERR "**** Reindexing existing file: $existing_file\n";
402
403 push(@$reindex_files,$existing_file);
404 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
405 }
406
407 }
408
409 _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex");
410
411}
412
413
414
4151;
Note: See TracBrowser for help on using the repository browser.