source: main/trunk/greenstone2/perllib/inexport.pm@ 21620

Last change on this file since 21620 was 21620, checked in by mdewsnip, 14 years ago

Changed inexport.pm to use the infodbtype value from the collect.cfg file instead of hard-wiring GDBM. Part of making the code less GDBM-specific.

  • Property svn:executable set to *
File size: 11.3 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use File::Basename;
31
32use dbutil;
33use util;
34
35
36sub oid_count_file {
37 my ($archivedir) = @_;
38 return &util::filename_cat ($archivedir, "OIDcount");
39}
40
41
42sub prime_doc_oid_count
43{
44 my ($archivedir) = @_;
45 my $oid_count_filename = &oid_count_file($archivedir);
46
47 if (-e $oid_count_filename) {
48 if (open(OIDIN,"<$oid_count_filename")) {
49 my $OIDcount = <OIDIN>;
50 chomp $OIDcount;
51 close(OIDIN);
52
53 $doc::OIDcount = $OIDcount;
54 }
55 else {
56
57 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
58 print STDERR "Setting value to 0\n";
59 }
60 }
61
62}
63
64sub store_doc_oid_count
65{
66 # Use the file "OIDcount" in the archives directory to record
67 # what value doc.pm got up to
68
69 my ($archivedir) = @_;
70 my $oid_count_filename = &oid_count_file($archivedir);
71
72
73 if (open(OIDOUT,">$oid_count_filename")) {
74 print OIDOUT $doc::OIDcount, "\n";
75
76 close(OIDOUT);
77 }
78 else {
79 print STDERR "Warning: unable to store document OID count\n";
80 }
81}
82
83
84
85sub new_vs_old_import_diff
86{
87 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
88
89 # Get the infodbtype value for this collection from the arcinfo object
90 my $infodbtype = $archive_info->{'infodbtype'};
91
92 # in this method, we want to know if metadata files are modified or not.
93 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
94
95 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
96
97 # First convert all files to absolute form
98 # This is to support the situation where the import folder is not
99 # the default
100
101 my $prev_all_files = $archive_info->{'prev_import_filelist'};
102 my $full_prev_all_files = {};
103
104 foreach my $prev_file (keys %$prev_all_files) {
105
106 if (!&util::filename_is_absolute($prev_file)) {
107 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
108 $full_prev_all_files->{$full_prev_file} = $prev_file;
109 }
110 else {
111 $full_prev_all_files->{$prev_file} = $prev_file;
112 }
113 }
114
115
116 # Figure out which are the new files, existing files and so
117 # by implication the files from the previous import that are not
118 # there any more => mark them for deletion
119 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
120
121 my $full_curr_file = $curr_file;
122
123 # entry in 'all_files' is moved to either 'existing_files',
124 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
125
126 if (!&util::filename_is_absolute($curr_file)) {
127 # add in import dir to make absolute
128 $full_curr_file = &util::filename_cat($importdir,$curr_file);
129 }
130
131 # figure out if new file or not
132 if (defined $full_prev_all_files->{$full_curr_file}) {
133 # delete it so that only files that need deleting are left
134 delete $full_prev_all_files->{$full_curr_file};
135
136 # had it before. is it a metadata file?
137 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
138
139 # is it modified??
140 if (-M $full_curr_file < $archiveinf_timestamp) {
141 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
142 # its newer than last build
143 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
144 }
145 }
146 else {
147 if ($incremental_mode eq "all") {
148
149 # had it before
150 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
151
152 }
153 else {
154 # Warning in "onlyadd" mode, but had it before!
155 print STDERR "Warning: File $full_curr_file previously imported.\n";
156 print STDERR " Treating as new file\n";
157
158 $block_hash->{'new_files'}->{$full_curr_file} = 1;
159
160 }
161 }
162 }
163 else {
164 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
165 # the new file is the special sort of file greenstone uses
166 # to attach metadata to src documents
167 # i.e metadata.xml
168 # (but note, the filename used is not constrained in
169 # Greenstone to always be this)
170
171 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
172 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
173 }
174 else {
175 $block_hash->{'new_files'}->{$full_curr_file} = 1;
176 }
177 }
178
179
180 delete $block_hash->{'all_files'}->{$curr_file};
181 }
182
183
184
185
186 # Deal with complication of new or modified metadata files by forcing
187 # everything from this point down in the file hierarchy to
188 # be freshly imported.
189 #
190 # This may mean files that have not changed are reindexed, but does
191 # guarantee by the end of processing all new metadata is correctly
192 # associated with the relevant document(s).
193
194 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
195 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
196
197 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
198 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
199
200 # Go through existing_files, and mark anything that is contained
201 # within 'situated_dir' to be reindexed (in case some of the metadata
202 # attaches to one of these files)
203
204 my $reindex_files = [];
205
206 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
207
208 if ($existing_f =~ m/^$situated_dir/) {
209 push(@$reindex_files,$existing_f);
210 $block_hash->{'reindex_files'}->{$existing_f} = 1;
211 delete $block_hash->{'existing_files'}->{$existing_f};
212
213 }
214 }
215
216 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
217 # (or equivalent)
218 $block_hash->{'new_files'}->{$new_mdf} = 1;
219
220 }
221
222 # go through remaining existing files and work out what has changed and needs to be reindexed.
223 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
224
225 my $reindex_files = [];
226
227 foreach my $existing_filename (@existing_files) {
228 if (-M $existing_filename < $archiveinf_timestamp) {
229 # file is newer than last build
230
231 my $existing_file = $existing_filename;
232 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
233
234 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
235 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
236
237 print STDERR "**** Reindexing existing file: $existing_file\n";
238
239 push(@$reindex_files,$existing_file);
240 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
241 }
242
243 }
244
245
246 # By this point full_prev_all_files contains the files
247 # mentioned in archiveinf-src.db but are not in the 'import'
248 # folder (or whatever was specified through -importdir ...)
249
250 # This list can contain files that were created in the 'tmp' or
251 # 'cache' areas (such as screen-size and thumbnail images).
252 #
253 # In building the final list of files to delete, we test to see if
254 # it exists on the filesystem and if it does (unusual for a "normal"
255 # file in import, but possible in the case of 'tmp' files),
256 # supress it from going into the final list
257
258 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
259
260 my @deleted_files = values %$full_prev_all_files;
261 map { my $curr_file = $_;
262 my $full_curr_file = $curr_file;
263
264 if (!&util::filename_is_absolute($curr_file)) {
265 # add in import dir to make absolute
266
267 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
268 }
269
270
271 if (!-e $full_curr_file) {
272 $block_hash->{'deleted_files'}->{$curr_file} = 1;
273 }
274 } @deleted_files;
275
276
277
278}
279
280
281# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
282# $mode is 'delete' or 'reindex'
283sub mark_docs_for_deletion
284{
285 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
286
287 my $mode_text = "deleted from index";
288 if ($mode eq "reindex") {
289 $mode_text = "reindexed";
290 }
291
292 # Get the infodbtype value for this collection from the arcinfo object
293 my $infodbtype = $archive_info->{'infodbtype'};
294
295 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
296 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
297 my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
298 my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
299
300 # record files marked for deletion in arcinfo
301 foreach my $file (@$deleted_files) {
302 # use 'archiveinf-src' info database file to look up all the OIDs
303 # that this file is used in (note in most cases, it's just one OID)
304
305 my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
306 my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
307 my $oids = $src_rec->{'oid'};
308 my $file_record_deleted = 0;
309
310 # delete the src record
311 &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
312
313 foreach my $oid (@$oids) {
314
315 # find the source doc (the primary file that becomes this oid)
316 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
317 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
318 my $doc_source_file = $doc_rec->{'src-file'}->[0];
319 if (!&util::filename_is_absolute($doc_source_file)) {
320 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
321 }
322
323 if ($doc_source_file ne $file) {
324 # its an associated or metadata file
325
326 # mark source doc for reimport as one of its assoc files has changed or deleted
327 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
328
329 }
330 my $curr_status = $archive_info->get_status_info($oid);
331 if (defined($curr_status) && (($curr_status ne "D"))) {
332 if ($verbosity>1) {
333 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
334 }
335 # mark oid for deletion (it will be deleted or reimported)
336 $archive_info->set_status_info($oid,"D");
337 my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
338 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
339
340 my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
341 &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
342 }
343 }
344 }
345
346 &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
347 &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
348}
349
350
351
3521;
Note: See TracBrowser for help on using the repository browser.