source: gsdl/trunk/perllib/inexport.pm@ 18457

Last change on this file since 18457 was 18457, checked in by davidb, 15 years ago

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

  • Property svn:executable set to *
File size: 5.1 KB
RevLine 
[18457]1###########################################################################
2#
3# inexport.pm -- useful utilities to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30use util;
31use GDBMUtils;
32
33sub new_vs_old_import_diff
34{
35 my ($archive_info,$block_hash,$importdir) = @_;
36
37 # First convert all files to absolute form
38 # This is to support the situation where the import folder is not
39 # the default
40
41 my $prev_all_files = $archive_info->{'prev_import_filelist'};
42 my $full_prev_all_files = {};
43
44 foreach my $prev_file (keys %$prev_all_files) {
45
46 if (!&util::filename_is_absolute($prev_file)) {
47 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
48 $full_prev_all_files->{$full_prev_file} = $prev_file;
49 }
50 else {
51 $full_prev_all_files->{$prev_file} = $prev_file;
52 }
53 }
54
55 # Figure out which are the new files, existing files and so
56 # by implication the files from the previous import that are not
57 # there any more => mark them for deletion
58 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
59
60 my $full_curr_file = $curr_file;
61
62 # entry in 'all_files' is moved to either 'existing_files',
63 # 'deleted_files', or 'new_files'
64
65 if (!&util::filename_is_absolute($curr_file)) {
66 # add in import dir to make absolute
67 $full_curr_file = &util::filename_cat($importdir,$curr_file);
68 }
69
70 # figure of if new file or not
71 if (defined $full_prev_all_files->{$full_curr_file}) {
72 # had it before
73 $block_hash->{'existing_files'}->{$curr_file} = 1;
74 # Now remove it, so by end of loop only the files
75 # that need deleting are left
76
77 delete $full_prev_all_files->{$full_curr_file};
78 }
79 else {
80 $block_hash->{'new_files'}->{$curr_file} = 1;
81 }
82
83 delete $block_hash->{'all_files'}->{$curr_file};
84 }
85
86 # By this point full_prev_all_files contains only the files
87 # that are not in the current import folder => i.e. files
88 # to be deleted
89 #
90 # The value in each key is its "local" import file name, which is what
91 # we want to use
92 my @deleted_files = values %$full_prev_all_files;
93 map { $block_hash->{'deleted_files'}->{$_} = 1 } @deleted_files;
94}
95
96sub mark_docs_for_deletion
97{
98 my ($archive_info,$deleted_files_ref,$archivedir,$verbosity) = @_;
99
100 my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
101 my $doc_db = "archiveinf-doc$db_ext";
102 my $src_db = "archiveinf-src$db_ext";
103 my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
104 my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
105
106
107 # record files marked for deletion in arcinfo
108 foreach my $file (@$deleted_files_ref) {
109 # use 'archiveinf-src' GDBM file to look up all the OIDs
110 # this file is used in (note in most cases, it's just one OID)
111
112 # An improvement would be to have the record read
113 # into a hash array
114 my $src_rec = GDBMUtils::gdbmRecordToHash($arcinfo_src_filename,$file);
115 my $oids = $src_rec->{'oid'};
116 foreach my $oid (@$oids) {
117
118 # find out if it's an assoc file or main doc
119
120 my $doc_rec = GDBMUtils::gdbmRecordToHash($arcinfo_doc_filename,$oid);
121## print STDERR "file = $file\n";
122
123 if ($doc_rec->{'src-file'}->[0] eq $file) {
124 # mark it for deletion
125 if ($verbosity>1) {
126 print STDERR "$oid marked to be deleted\n";
127 }
128 $archive_info->set_status_info($oid,"D");
129
130 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
131 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
132 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
133 }
134 else {
135 # assoc file => mark it for re-indexing (safest thing to do)
136 my $curr_status = $archive_info->get_status_info($oid);
137
138
139 if (defined($curr_status) && (($curr_status ne "D") && ($curr_status ne "R"))) {
140 if ($verbosity > 1) {
141 print STDERR "$oid marked for (potential) reindexing\n";
142 print STDERR " (because associated file $file deleted)\n";
143 }
144 $archive_info->set_status_info($oid,"R");
145
146 my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
147 $val =~ s/^<index-status>(.*)$/<index-status>R/m;
148 &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
149 }
150 }
151 }
152 }
153}
154
155
156
1571;
Note: See TracBrowser for help on using the repository browser.