source: main/trunk/greenstone2/perllib/inexport.pm@ 21620

Last change on this file since 21620 was 21620, checked in by mdewsnip, 14 years ago

Changed inexport.pm to use the infodbtype value from the collect.cfg file instead of hard-wiring GDBM. Part of making the code less GDBM-specific.

  • Property svn:executable set to *
File size: 11.3 KB
RevLine 
[18457]1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
[19789]30use File::Basename;
31
[21553]32use dbutil;
[18457]33use util;
34
[21563]35
[21306]36sub oid_count_file {
37 my ($archivedir) = @_;
38 return &util::filename_cat ($archivedir, "OIDcount");
39}
40
41
[18528]42sub prime_doc_oid_count
43{
44 my ($archivedir) = @_;
[21306]45 my $oid_count_filename = &oid_count_file($archivedir);
[18528]46
47 if (-e $oid_count_filename) {
48 if (open(OIDIN,"<$oid_count_filename")) {
49 my $OIDcount = <OIDIN>;
50 chomp $OIDcount;
51 close(OIDIN);
52
53 $doc::OIDcount = $OIDcount;
54 }
55 else {
56
57 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
58 print STDERR "Setting value to 0\n";
59 }
60 }
61
62}
63
64sub store_doc_oid_count
65{
66 # Use the file "OIDcount" in the archives directory to record
67 # what value doc.pm got up to
68
69 my ($archivedir) = @_;
[21306]70 my $oid_count_filename = &oid_count_file($archivedir);
[18528]71
72
73 if (open(OIDOUT,">$oid_count_filename")) {
74 print OIDOUT $doc::OIDcount, "\n";
75
76 close(OIDOUT);
77 }
78 else {
79 print STDERR "Warning: unable to store document OID count\n";
80 }
81}
82
83
84
[18457]85sub new_vs_old_import_diff
86{
[20578]87 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
[18457]88
[21620]89 # Get the infodbtype value for this collection from the arcinfo object
90 my $infodbtype = $archive_info->{'infodbtype'};
91
[20776]92 # in this method, we want to know if metadata files are modified or not.
[21620]93 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
[20776]94
95 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
96
[18457]97 # First convert all files to absolute form
98 # This is to support the situation where the import folder is not
99 # the default
100
101 my $prev_all_files = $archive_info->{'prev_import_filelist'};
102 my $full_prev_all_files = {};
103
104 foreach my $prev_file (keys %$prev_all_files) {
105
106 if (!&util::filename_is_absolute($prev_file)) {
107 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
108 $full_prev_all_files->{$full_prev_file} = $prev_file;
109 }
110 else {
111 $full_prev_all_files->{$prev_file} = $prev_file;
112 }
113 }
114
[18469]115
[18457]116 # Figure out which are the new files, existing files and so
117 # by implication the files from the previous import that are not
118 # there any more => mark them for deletion
119 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
120
121 my $full_curr_file = $curr_file;
122
123 # entry in 'all_files' is moved to either 'existing_files',
[20776]124 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
[18457]125
126 if (!&util::filename_is_absolute($curr_file)) {
127 # add in import dir to make absolute
128 $full_curr_file = &util::filename_cat($importdir,$curr_file);
129 }
130
[19498]131 # figure out if new file or not
[18457]132 if (defined $full_prev_all_files->{$full_curr_file}) {
[20776]133 # delete it so that only files that need deleting are left
134 delete $full_prev_all_files->{$full_curr_file};
135
136 # had it before. is it a metadata file?
137 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
[20578]138
[20776]139 # is it modified??
140 if (-M $full_curr_file < $archiveinf_timestamp) {
141 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
142 # its newer than last build
143 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
144 }
[20578]145 }
146 else {
[20776]147 if ($incremental_mode eq "all") {
148
149 # had it before
150 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
151
152 }
153 else {
154 # Warning in "onlyadd" mode, but had it before!
155 print STDERR "Warning: File $full_curr_file previously imported.\n";
156 print STDERR " Treating as new file\n";
157
158 $block_hash->{'new_files'}->{$full_curr_file} = 1;
159
160 }
[20578]161 }
162 }
163 else {
164 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
165 # the new file is the special sort of file greenstone uses
166 # to attach metadata to src documents
167 # i.e metadata.xml
168 # (but note, the filename used is not constrained in
169 # Greenstone to always be this)
[18457]170
[20776]171 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
172 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
[20578]173 }
174 else {
175 $block_hash->{'new_files'}->{$full_curr_file} = 1;
176 }
[18457]177 }
[20578]178
[18457]179
180 delete $block_hash->{'all_files'}->{$curr_file};
181 }
182
[20578]183
[21306]184
185
[20776]186 # Deal with complication of new or modified metadata files by forcing
[20578]187 # everything from this point down in the file hierarchy to
188 # be freshly imported.
189 #
190 # This may mean files that have not changed are reindexed, but does
191 # guarantee by the end of processing all new metadata is correctly
192 # associated with the relevant document(s).
193
[20776]194 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
[20578]195 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
196
197 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
[20769]198 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
199
[20578]200 # Go through existing_files, and mark anything that is contained
201 # within 'situated_dir' to be reindexed (in case some of the metadata
202 # attaches to one of these files)
203
204 my $reindex_files = [];
205
206 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
[20769]207
[20578]208 if ($existing_f =~ m/^$situated_dir/) {
209 push(@$reindex_files,$existing_f);
210 $block_hash->{'reindex_files'}->{$existing_f} = 1;
[21306]211 delete $block_hash->{'existing_files'}->{$existing_f};
[20578]212
213 }
214 }
215
216 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
217 # (or equivalent)
218 $block_hash->{'new_files'}->{$new_mdf} = 1;
219
220 }
221
[21306]222 # go through remaining existing files and work out what has changed and needs to be reindexed.
223 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
224
225 my $reindex_files = [];
226
227 foreach my $existing_filename (@existing_files) {
228 if (-M $existing_filename < $archiveinf_timestamp) {
229 # file is newer than last build
230
231 my $existing_file = $existing_filename;
232 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
233
234 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
235 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
236
237 print STDERR "**** Reindexing existing file: $existing_file\n";
238
239 push(@$reindex_files,$existing_file);
240 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
241 }
242
243 }
244
[20578]245
[18469]246 # By this point full_prev_all_files contains the files
247 # mentioned in archiveinf-src.db but are not in the 'import'
248 # folder (or whatever was specified through -importdir ...)
249
250 # This list can contain files that were created in the 'tmp' or
251 # 'cache' areas (such as screen-size and thumbnail images).
[18457]252 #
[18469]253 # In building the final list of files to delete, we test to see if
[20578]254 # it exists on the filesystem and if it does (unusual for a "normal"
255 # file in import, but possible in the case of 'tmp' files),
256 # supress it from going into the final list
[18469]257
258 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
259
[18457]260 my @deleted_files = values %$full_prev_all_files;
[18469]261 map { my $curr_file = $_;
262 my $full_curr_file = $curr_file;
263
264 if (!&util::filename_is_absolute($curr_file)) {
265 # add in import dir to make absolute
266
267 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
268 }
269
270
271 if (!-e $full_curr_file) {
272 $block_hash->{'deleted_files'}->{$curr_file} = 1;
273 }
274 } @deleted_files;
[20578]275
276
277
[18457]278}
279
[19498]280
[20788]281# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
[21306]282# $mode is 'delete' or 'reindex'
283sub mark_docs_for_deletion
[18457]284{
[21306]285 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
[18457]286
[21306]287 my $mode_text = "deleted from index";
288 if ($mode eq "reindex") {
289 $mode_text = "reindexed";
290 }
[18457]291
[21620]292 # Get the infodbtype value for this collection from the arcinfo object
293 my $infodbtype = $archive_info->{'infodbtype'};
[18457]294
[21620]295 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
296 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
297 my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
298 my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
299
[18457]300 # record files marked for deletion in arcinfo
[19498]301 foreach my $file (@$deleted_files) {
[21564]302 # use 'archiveinf-src' info database file to look up all the OIDs
[19789]303 # that this file is used in (note in most cases, it's just one OID)
[18457]304
[21620]305 my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
[21554]306 my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
[18457]307 my $oids = $src_rec->{'oid'};
[20776]308 my $file_record_deleted = 0;
[20788]309
310 # delete the src record
[21620]311 &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
[20788]312
[18457]313 foreach my $oid (@$oids) {
314
[20788]315 # find the source doc (the primary file that becomes this oid)
[21620]316 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
[21554]317 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
[20776]318 my $doc_source_file = $doc_rec->{'src-file'}->[0];
319 if (!&util::filename_is_absolute($doc_source_file)) {
320 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
321 }
[18457]322
[20788]323 if ($doc_source_file ne $file) {
324 # its an associated or metadata file
325
[20776]326 # mark source doc for reimport as one of its assoc files has changed or deleted
327 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
[20788]328
[18457]329 }
[20788]330 my $curr_status = $archive_info->get_status_info($oid);
331 if (defined($curr_status) && (($curr_status ne "D"))) {
332 if ($verbosity>1) {
333 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
[19498]334 }
[20788]335 # mark oid for deletion (it will be deleted or reimported)
336 $archive_info->set_status_info($oid,"D");
[21620]337 my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
[20788]338 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
[21557]339
340 my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
[21620]341 &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
[19498]342 }
[18457]343 }
344 }
[21560]345
[21620]346 &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
347 &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
[18457]348}
349
350
[18554]351
[18457]3521;
Note: See TracBrowser for help on using the repository browser.