source: main/trunk/greenstone2/perllib/inexport.pm@ 21564

Last change on this file since 21564 was 21564, checked in by mdewsnip, 14 years ago

Changed lots of occurrences of "GDBM" in comments, variable names and function names, where the code isn't GDBM-specific. Part of making the code less GDBM-specific.

  • Property svn:executable set to *
File size: 11.1 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use File::Basename;
31
32use dbutil;
33use util;
34
35
36sub src_db_file {
37 my ($archivedir) = @_;
38 return &util::filename_cat ($archivedir, "archiveinf-src.gdb");
39}
40
41sub doc_db_file {
42 my ($archivedir) = @_;
43 return &util::filename_cat ($archivedir, "archiveinf-doc.gdb");
44}
45
46sub oid_count_file {
47 my ($archivedir) = @_;
48 return &util::filename_cat ($archivedir, "OIDcount");
49}
50
51
52sub prime_doc_oid_count
53{
54 my ($archivedir) = @_;
55 my $oid_count_filename = &oid_count_file($archivedir);
56
57 if (-e $oid_count_filename) {
58 if (open(OIDIN,"<$oid_count_filename")) {
59 my $OIDcount = <OIDIN>;
60 chomp $OIDcount;
61 close(OIDIN);
62
63 $doc::OIDcount = $OIDcount;
64 }
65 else {
66
67 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
68 print STDERR "Setting value to 0\n";
69 }
70 }
71
72}
73
74sub store_doc_oid_count
75{
76 # Use the file "OIDcount" in the archives directory to record
77 # what value doc.pm got up to
78
79 my ($archivedir) = @_;
80 my $oid_count_filename = &oid_count_file($archivedir);
81
82
83 if (open(OIDOUT,">$oid_count_filename")) {
84 print OIDOUT $doc::OIDcount, "\n";
85
86 close(OIDOUT);
87 }
88 else {
89 print STDERR "Warning: unable to store document OID count\n";
90 }
91}
92
93
94
95sub new_vs_old_import_diff
96{
97 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
98
99 # in this method, we want to know if metadata files are modified or not.
100 my $arcinfo_doc_filename = &doc_db_file($archivedir);
101
102 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
103
104 # First convert all files to absolute form
105 # This is to support the situation where the import folder is not
106 # the default
107
108 my $prev_all_files = $archive_info->{'prev_import_filelist'};
109 my $full_prev_all_files = {};
110
111 foreach my $prev_file (keys %$prev_all_files) {
112
113 if (!&util::filename_is_absolute($prev_file)) {
114 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
115 $full_prev_all_files->{$full_prev_file} = $prev_file;
116 }
117 else {
118 $full_prev_all_files->{$prev_file} = $prev_file;
119 }
120 }
121
122
123 # Figure out which are the new files, existing files and so
124 # by implication the files from the previous import that are not
125 # there any more => mark them for deletion
126 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
127
128 my $full_curr_file = $curr_file;
129
130 # entry in 'all_files' is moved to either 'existing_files',
131 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
132
133 if (!&util::filename_is_absolute($curr_file)) {
134 # add in import dir to make absolute
135 $full_curr_file = &util::filename_cat($importdir,$curr_file);
136 }
137
138 # figure out if new file or not
139 if (defined $full_prev_all_files->{$full_curr_file}) {
140 # delete it so that only files that need deleting are left
141 delete $full_prev_all_files->{$full_curr_file};
142
143 # had it before. is it a metadata file?
144 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
145
146 # is it modified??
147 if (-M $full_curr_file < $archiveinf_timestamp) {
148 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
149 # its newer than last build
150 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
151 }
152 }
153 else {
154 if ($incremental_mode eq "all") {
155
156 # had it before
157 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
158
159 }
160 else {
161 # Warning in "onlyadd" mode, but had it before!
162 print STDERR "Warning: File $full_curr_file previously imported.\n";
163 print STDERR " Treating as new file\n";
164
165 $block_hash->{'new_files'}->{$full_curr_file} = 1;
166
167 }
168 }
169 }
170 else {
171 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
172 # the new file is the special sort of file greenstone uses
173 # to attach metadata to src documents
174 # i.e metadata.xml
175 # (but note, the filename used is not constrained in
176 # Greenstone to always be this)
177
178 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
179 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
180 }
181 else {
182 $block_hash->{'new_files'}->{$full_curr_file} = 1;
183 }
184 }
185
186
187 delete $block_hash->{'all_files'}->{$curr_file};
188 }
189
190
191
192
193 # Deal with complication of new or modified metadata files by forcing
194 # everything from this point down in the file hierarchy to
195 # be freshly imported.
196 #
197 # This may mean files that have not changed are reindexed, but does
198 # guarantee by the end of processing all new metadata is correctly
199 # associated with the relevant document(s).
200
201 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
202 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
203
204 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
205 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
206
207 # Go through existing_files, and mark anything that is contained
208 # within 'situated_dir' to be reindexed (in case some of the metadata
209 # attaches to one of these files)
210
211 my $reindex_files = [];
212
213 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
214
215 if ($existing_f =~ m/^$situated_dir/) {
216 push(@$reindex_files,$existing_f);
217 $block_hash->{'reindex_files'}->{$existing_f} = 1;
218 delete $block_hash->{'existing_files'}->{$existing_f};
219
220 }
221 }
222
223 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
224 # (or equivalent)
225 $block_hash->{'new_files'}->{$new_mdf} = 1;
226
227 }
228
229 # go through remaining existing files and work out what has changed and needs to be reindexed.
230 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
231
232 my $reindex_files = [];
233
234 foreach my $existing_filename (@existing_files) {
235 if (-M $existing_filename < $archiveinf_timestamp) {
236 # file is newer than last build
237
238 my $existing_file = $existing_filename;
239 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
240
241 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
242 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
243
244 print STDERR "**** Reindexing existing file: $existing_file\n";
245
246 push(@$reindex_files,$existing_file);
247 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
248 }
249
250 }
251
252
253 # By this point full_prev_all_files contains the files
254 # mentioned in archiveinf-src.db but are not in the 'import'
255 # folder (or whatever was specified through -importdir ...)
256
257 # This list can contain files that were created in the 'tmp' or
258 # 'cache' areas (such as screen-size and thumbnail images).
259 #
260 # In building the final list of files to delete, we test to see if
261 # it exists on the filesystem and if it does (unusual for a "normal"
262 # file in import, but possible in the case of 'tmp' files),
263 # supress it from going into the final list
264
265 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
266
267 my @deleted_files = values %$full_prev_all_files;
268 map { my $curr_file = $_;
269 my $full_curr_file = $curr_file;
270
271 if (!&util::filename_is_absolute($curr_file)) {
272 # add in import dir to make absolute
273
274 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
275 }
276
277
278 if (!-e $full_curr_file) {
279 $block_hash->{'deleted_files'}->{$curr_file} = 1;
280 }
281 } @deleted_files;
282
283
284
285}
286
287
288# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
289# $mode is 'delete' or 'reindex'
290sub mark_docs_for_deletion
291{
292 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
293
294 my $mode_text = "deleted from index";
295 if ($mode eq "reindex") {
296 $mode_text = "reindexed";
297 }
298 my $arcinfo_doc_filename = &doc_db_file($archivedir);
299 my $arcinfo_src_filename = &src_db_file($archivedir);
300 my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle("gdbm", $arcinfo_doc_filename, "append");
301 my $src_infodb_file_handle = &dbutil::open_infodb_write_handle("gdbm", $arcinfo_src_filename, "append");
302
303
304 # record files marked for deletion in arcinfo
305 foreach my $file (@$deleted_files) {
306 # use 'archiveinf-src' info database file to look up all the OIDs
307 # that this file is used in (note in most cases, it's just one OID)
308
309 my $src_rec_string = &dbutil::read_infodb_entry("gdbm", $arcinfo_src_filename, $file);
310 my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
311 my $oids = $src_rec->{'oid'};
312 my $file_record_deleted = 0;
313
314 # delete the src record
315 &dbutil::delete_infodb_entry("gdbm", $src_infodb_file_handle, $file);
316
317 foreach my $oid (@$oids) {
318
319 # find the source doc (the primary file that becomes this oid)
320 my $doc_rec_string = &dbutil::read_infodb_entry("gdbm", $arcinfo_doc_filename, $oid);
321 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
322 my $doc_source_file = $doc_rec->{'src-file'}->[0];
323 if (!&util::filename_is_absolute($doc_source_file)) {
324 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
325 }
326
327 if ($doc_source_file ne $file) {
328 # its an associated or metadata file
329
330 # mark source doc for reimport as one of its assoc files has changed or deleted
331 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
332
333 }
334 my $curr_status = $archive_info->get_status_info($oid);
335 if (defined($curr_status) && (($curr_status ne "D"))) {
336 if ($verbosity>1) {
337 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
338 }
339 # mark oid for deletion (it will be deleted or reimported)
340 $archive_info->set_status_info($oid,"D");
341 my $val = &dbutil::read_infodb_entry("gdbm", $arcinfo_doc_filename, $oid);
342 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
343
344 my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
345 &dbutil::write_infodb_entry("gdbm", $doc_infodb_file_handle, $oid, $val_rec);
346 }
347 }
348 }
349
350 &dbutil::close_infodb_write_handle("gdbm", $doc_infodb_file_handle);
351 &dbutil::close_infodb_write_handle("gdbm", $src_infodb_file_handle);
352}
353
354
355
3561;
Note: See TracBrowser for help on using the repository browser.