source: gsdl/trunk/perllib/IncrementalBuildUtils.pm@ 17110

Last change on this file since 17110 was 17087, checked in by davidb, 16 years ago

Introduction of new GDBM alternative for archives.inf as step towards full incremental building. Information traditionally stored in archives.inf PLUS additional information that will help with working out what files have changed since last build, and what doc-id they hashed to is stored in two GDBM databases. For now these databases aren't read, but in the future ArchivesInfPlugin will be upgraded to use these to support these.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.6 KB
Line 
1###########################################################################
2#
3# IncrementalBuildUtils.pm -- API to assist incremental building
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25# /** Initial versions of these functions by John Thompson, revisions by
26# * and turning it into a package by John Rowe. Used heavily by
27# * basebuilder::remove_document() and getdocument.pl
28# *
29# * @version 1.0 Initial version by John Thompson
30# * @version 1.1 Addition of get_document and change of get_document_as_xml
31# * by John Rowe
32# * @version 2.0 Package version including separation from calling code and
33# * modularisation by creating gdbmget, gdbmset and
34# * get_database_path by John Rowe
35# *
36# * @author John Thompson, DL Consulting Ltd.
37# * @author John Rowe, DL Consulting Ltd.
38# */
39###########################################################################
40package IncrementalBuildUtils;
41
BEGIN {
    # This module is useless outside a configured Greenstone environment, so
    # refuse to compile when either required variable is missing.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Put the Greenstone library directories at the front of the module
    # search path. The resulting order is classify, plugins, cpan, perllib.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC,
            "$perllib/classify",
            "$perllib/plugins",
            "$perllib/cpan",
            $perllib);
}
50
51use doc;
52use cfgread;
53use colcfg;
54use strict;
55use util;
56
57use ClassifyTreeModel;
58use IncrementalDocument;
59
# Verbose debugging output on STDERR: 1 = enabled, 0 = silent.
my $debug = 1;

# Ensure the Greenstone binaries and scripts are on the search path.
# GSDLOS is matched at the START of the string on purpose: an unanchored
# /win/ also matches "darwin", which would give Mac OS X the Windows ";"
# separator and corrupt PATH.
my $path_separator = ($ENV{'GSDLOS'} =~ /^win/i) ? ";" : ":";
$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'})
    . $path_separator . &util::filename_cat($ENV{'GSDLHOME'}, "bin", "script")
    . $path_separator . $ENV{'PATH'};
69
# /** Adds one section of a document, and recursively every subsection below
#  *  it, to the collection by pushing each piece of metadata through
#  *  setDocumentMetadata().
#  *
#  *  The section's 'docnum' metadata is written first and is mandatory. All
#  *  other metadata is then written, except 'Identifier', 'docnum', keys
#  *  starting with 'gsdl' and undefined/empty values. When the section is the
#  *  document's top section the document OID is also added to the
#  *  'browselist' classifier node.
#  *
#  *  @param  $collection   The name of the collection to update as a string
#  *  @param  $doc_obj      The document object; this sub calls its
#  *                        get_top_section(), get_all_metadata(), get_OID()
#  *                        and _lookup_section() methods
#  *  @param  $section      The section identifier to add as a string
#  *  @param  $updateindex  Optional, defaults to 0. Passed through to
#  *                        setDocumentMetadata() for this section and for
#  *                        every subsection.
#  *
#  *  Dies if the section carries no 'docnum' metadata.
#  */
sub addDocument
{
    my ($collection, $doc_obj, $section, $updateindex) = @_;

    $updateindex = 0 unless defined($updateindex);

    print STDERR "IncrementalBuildUtils::addDocument('$collection',$doc_obj,'$section')\n" if $debug;

    # Needed at the end to decide whether the browselist must be updated.
    my $is_top = ($section eq $doc_obj->get_top_section());

    # Every metadata value of this section is stored against this identifier.
    my $section_oid = $doc_obj->get_OID() . "$section";

    # Metadata of this section only - not of any child sections.
    my $metadata = $doc_obj->get_all_metadata($section);

    # Check and add the docnum first.
    my $found_docnum = 0;
    foreach my $pair (@$metadata)
    {
        my ($key, $value) = (@$pair);
        next unless $key eq "docnum";
        &setDocumentMetadata($collection, $section_oid, $key, "", $value, $updateindex);
        $found_docnum = 1;
    }

    if (!$found_docnum)
    {
        die("Fatal Error! Tried to add document without providing docnum");
    }

    # Add the rest piece by piece - this depends on the loading of a blank
    # document working the way it should.
    foreach my $pair (@$metadata)
    {
        my ($key, $value) = (@$pair);
        next if $key eq "Identifier" || $key eq "docnum" || $key =~ /^gsdl/;
        next unless defined $value && $value ne "";

        # Escape problematic stuff.
        $value =~ s/\\/\\\\/g;
        $value =~ s/\n/\\n/g;
        $value =~ s/\r/\\r/g;
        if ($value =~ /-{70,}/)
        {
            # A run of 70 or more hyphens would be treated by txt2db as a
            # record separator, so encode every hyphen in the value.
            $value =~ s/-/&\#045;/gi;
        }
        &setDocumentMetadata($collection, $section_oid, $key, "", $value, $updateindex);
    }

    # The browselist node is loaded through a ClassifyTreeNode based on a
    # dummy model. Only the top section is listed there.
    if ($is_top)
    {
        my $dummy_model = ClassifyTreeModel->new($collection, "");
        my $browselist_node = ClassifyTreeNode->new($dummy_model, "browselist");
        $browselist_node->addDocument($doc_obj->get_OID());
    }

    # Recurse through the child sections. $updateindex is forwarded so that
    # subsections honour the caller's request (previously it was dropped and
    # every subsection silently fell back to 0).
    # NOTE(review): the original also collected a 'contains' list here that
    # was never stored anywhere; it has been removed as dead code.
    my $section_ptr = $doc_obj->_lookup_section($section);
    if (defined $section_ptr)
    {
        foreach my $subsection (@{$section_ptr->{'subsection_order'}})
        {
            &addDocument($collection, $doc_obj, "$section.$subsection", $updateindex);
        }
    }
}
# /** addDocument() **/
150
# /** Replaces, adds or removes one metadata value of a document.
#  *
#  *  Up to three places are touched:
#  *   1. The GDBM record of the document, through an IncrementalDocument
#  *      (old value removed, new value added, then saved).
#  *   2. Every classifier tree whose collect.cfg entry contains the option
#  *      pair '-metadata <key>' (document removed under the old value and
#  *      added under the new value).
#  *   3. The Lucene indexes, but only when $updateindex is true.
#  *
#  *  Steps, in execution order:
#  *   A. (Reserved) connect to Lucene - not needed at present
#  *   B. Load the IncrementalDocument for $oid
#  *   C. Build a ClassifyTreeModel for each classifier that uses $key
#  *   D. If an old value is given: removeMetadata() on the document and
#  *      removeDocument() on each tree model
#  *   E. If a new value is given: addMetadata() on the document and
#  *      addDocument() on each tree model
#  *   F. If requested, call callGS2LuceneEditor()
#  *   G. Save the IncrementalDocument
#  *
#  *  A value only counts as "given" when it is defined and contains at least
#  *  one word character.
#  *
#  *  @param  $collection   The name of the collection to update as a string
#  *  @param  $oid          The unique identifier of a Greenstone document as
#  *                        a string
#  *  @param  $key          The key of the metadata being changed as a string
#  *  @param  $old_value    The value being removed/replaced, or an empty
#  *                        string when only adding
#  *  @param  $new_value    The value being added, or an empty string when
#  *                        only removing
#  *  @param  $updateindex  1 to get the Lucene indexes updated. Used to stop
#  *                        the indexes being changed during an incremental
#  *                        addition of a new document.
#  *
#  *  @author John Thompson, DL Consulting Ltd.
#  */
sub setDocumentMetadata()
{
    my ($collection, $oid, $key, $old_value, $new_value, $updateindex) = @_;
    if ($debug)
    {
        print STDERR "IncrementalBuildUtils::setDocumentMetadata('$collection','$oid','$key','$old_value','$new_value',$updateindex)\n";
    }

    # Decide up front which halves of the operation are wanted.
    my $has_old = (defined($old_value) && $old_value =~ /[\w\d]+/);
    my $has_new = (defined($new_value) && $new_value =~ /[\w\d]+/);

    # A. Nothing to do for Lucene yet; a daemon connection may go here later.

    # B. Load the stored document.
    print STDERR "* creating incremental document for $oid\n" if $debug;
    my $doc_obj = IncrementalDocument->new($collection, $oid);
    $doc_obj->loadDocument();

    # C. Collect a tree model for every classifier built on this key.
    print STDERR "* load collection configuration\n" if $debug;
    my $config_obj = &getConfigObj($collection);
    my @classifier_tree_models = ();
    my $clidx = 0;
    foreach my $classifier (@{$config_obj->{'classify'}})
    {
        $clidx++;
        my @options = @{$classifier};
        # Stop one short of the end: '-metadata' needs a following value.
        foreach my $pos (0 .. $#options - 1)
        {
            next unless $options[$pos] eq "-metadata";
            next unless $options[$pos + 1] eq $key;
            print STDERR "* creating a tree model for classifier: CL$clidx\n" if $debug;
            push(@classifier_tree_models, ClassifyTreeModel->new($collection, "CL" . $clidx));
        }
    }

    # D. Removing or replacing: take the old value out.
    if ($has_old)
    {
        print STDERR "* removing '$old_value' from GDBM database\n" if $debug;
        $doc_obj->removeMetadata($key, $old_value);
        foreach my $tree_model (@classifier_tree_models)
        {
            print STDERR "* removing '$old_value' from classifier tree\n" if $debug;
            $tree_model->removeDocument($old_value, $oid, 1);
        }
    }

    # E. Adding or replacing: put the new value in.
    if ($has_new)
    {
        print STDERR "* adding '$new_value' to GDBM database\n" if $debug;
        $doc_obj->addMetadata($key, $new_value);
        foreach my $tree_model (@classifier_tree_models)
        {
            print STDERR "* adding '$new_value' to classifier tree\n" if $debug;
            $tree_model->addDocument($new_value, $oid);
        }
    }

    # F. Bring the Lucene indexes in line, when asked to.
    if (defined($updateindex) && $updateindex)
    {
        print STDERR "* updating Lucene indexes\n" if $debug;
        &callGS2LuceneEditor($collection, $doc_obj->getDocNum, $key, $old_value, $new_value);
    }

    # G. Persist the document and release it.
    $doc_obj->saveDocument();
    $doc_obj = 0;
}
# /** setDocumentMetadata() **/
272
# /** Removes a document from every Lucene index of a collection by running
#  *  the GS2LuceneDelete java tool once per index directory.
#  *
#  *  Index directories are the entries of <GSDLHOME>/collect/<collection>/index
#  *  whose name ends in 'idx'. If that directory cannot be opened there is no
#  *  built index and nothing is done.
#  *
#  *  @param  $collection  The name of the collection to update as a string
#  *  @param  $docnum      The document number (Lucene node id) to delete
#  */
sub callGS2LuceneDelete
{
    my ($collection, $docnum) = @_;

    # Some path information that is the same for all indexes
    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneDelete";
    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");

    # No index directory means no built index, so there is nothing to do.
    opendir(my $index_dh, $indexpath) or return;
    my @index_files = readdir($index_dh);
    closedir($index_dh);

    # Call the java application for each index (not every index will include
    # the document we have been asked to delete).
    foreach my $actual_index_dir (@index_files)
    {
        next unless $actual_index_dir =~ /idx$/;
        my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
        # The index path is quoted so that installations living under a
        # directory containing spaces still produce a valid command line.
        my $cmd = "java -classpath \"$classpath\" $java_lucene --index \"$full_index_dir\" --nodeid $docnum";
        print STDERR "CMD: " . $cmd . "\n" if $debug;
        # Run command, capturing stderr along with stdout
        my $result = `$cmd 2>&1`;
        print STDERR $result if $debug;
    }
    return;
}
# /** callGS2LuceneDelete() **/
311
# /** Quotes a single value for inclusion in the java command line built by
#  *  callGS2LuceneEditor().
#  *
#  *  On Windows the value is wrapped in double quotes with embedded double
#  *  quotes backslash-escaped (best effort - cmd.exe quoting is not fully
#  *  escapable). Elsewhere it is wrapped in single quotes, which POSIX shells
#  *  treat literally, so '$', '`' and '"' inside metadata can no longer be
#  *  interpreted by the shell.
#  *
#  *  @param  $arg  The raw value
#  *  @return The quoted value, safe to append to the command string
#  */
sub _quoteLuceneShellArg
{
    my ($arg) = @_;
    if ($ENV{'GSDLOS'} =~ /^win/i)
    {
        $arg =~ s/"/\\"/g;
        return "\"$arg\"";
    }
    # Close the quote, emit an escaped quote, reopen: ' becomes '\''
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}

# /** Changes the value of one metadata field of a document in every Lucene
#  *  index of a collection by running the GS2LuceneEditor java tool once per
#  *  index directory.
#  *
#  *  The metadata key is first translated into its two letter field code via
#  *  getFieldFromBuildCFG(). If no such code is found the metadata is not
#  *  part of any index and nothing is done; likewise if the index directory
#  *  cannot be opened.
#  *
#  *  @param  $collection  The name of the collection to update as a string
#  *  @param  $docnum      The document number (Lucene node id) to edit
#  *  @param  $key         The metadata key being changed
#  *  @param  $old_value   The value to remove; ignored unless it is defined
#  *                       and contains a word character
#  *  @param  $new_value   The value to add; ignored unless it is defined
#  *                       and contains a word character
#  */
sub callGS2LuceneEditor
{
    my ($collection, $docnum, $key, $old_value, $new_value) = @_;

    # Some path information that is the same for all indexes
    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"java","classes");
    my $jarpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneEditor";
    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");

    # We have to convert the given metadata key into its two letter field
    # code, which is looked up in the build.cfg file. It is entirely possible
    # that the field isn't part of any index, in which case we are done.
    my $field = &getFieldFromBuildCFG($indexpath, $key);
    return unless $field =~ /^\w\w$/;

    # Java separates classpath entries with ';' on Windows and ':' elsewhere;
    # a hard-coded ':' makes the class unloadable on Windows.
    my $classpath_separator = ($ENV{'GSDLOS'} =~ /^win/i) ? ";" : ":";
    my $full_classpath = $classpath . $classpath_separator . $jarpath;

    # The arguments that don't change between indexes
    my $java_args = "--nodeid $docnum --field $field ";
    if (defined($old_value) && $old_value =~ /[\w\d]+/)
    {
        $java_args .= "--oldvalue " . &_quoteLuceneShellArg($old_value) . " ";
    }
    if (defined($new_value) && $new_value =~ /[\w\d]+/)
    {
        $java_args .= "--newvalue " . &_quoteLuceneShellArg($new_value) . " ";
    }

    # No index directory means no built index, so there is nothing to do.
    opendir(my $index_dh, $indexpath) or return;
    my @index_files = readdir($index_dh);
    closedir($index_dh);

    # Call the java application for each index (not every index will include
    # the document we have been asked to modify).
    foreach my $actual_index_dir (@index_files)
    {
        next unless $actual_index_dir =~ /idx$/;
        my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
        # Quote the index path so directories containing spaces work.
        my $cur_java_args = "--index \"$full_index_dir\" " . $java_args;
        my $cmd = "java -classpath \"$full_classpath\" $java_lucene $cur_java_args 2>&1";
        print STDERR "CMD: $cmd\n" if $debug;
        # Run command
        my $result = `$cmd`;
        print STDERR $result if $debug;
    }
    return;
}
# /** callGS2LuceneEditor() **/
372
## Remove a document from the GDBM database, the Lucene indexes, the
#  classifier trees and the browselist.
#
#  The document is looked up through an IncrementalDocument; when its docnum
#  is not greater than -1 the document is taken not to exist and nothing is
#  changed.
#
#  @param collection The collection to alter
#  @param oid        The unique identifier of the document to be removed
##
sub deleteDocument
{
    my ($collection, $oid) = @_;

    # Load the incremental document to go with this oid, as we need some
    # information from it.
    my $doc_obj = IncrementalDocument->new($collection, $oid);
    $doc_obj->loadDocument();

    # Check if this object even exists by retrieving the docnum.
    my $doc_num = $doc_obj->getDocNum();
    print STDERR "Removing document docnum: $doc_num\n" if $debug;
    # No document, no need to delete.
    return unless $doc_num > -1;

    # Blank out the record stored under the oid...
    # NOTE(review): GDBMUtils is not loaded by this file itself; presumably
    # one of the modules used above pulls it in - confirm.
    &GDBMUtils::gdbmCachedCollectionSet($collection, $oid, "");
    # ...and the reverse lookup stored under the docnum.
    &GDBMUtils::gdbmCachedCollectionSet($collection, $doc_num, "");
    # Remove the document from the Lucene indexes.
    &callGS2LuceneDelete($collection, $doc_num);

    # Work out which classifier trees are built on which metadata key. More
    # than one classifier may use the same key, so every key maps to a LIST
    # of tree models (a single slot per key would let later classifiers
    # overwrite earlier ones, leaving stale entries in the earlier trees).
    print STDERR "* load collection configuration\n" if $debug;
    my $config_obj = &getConfigObj($collection);
    my $clidx = 1;
    my %classifier_tree_models = ();
    foreach my $classifier (@{$config_obj->{'classify'}})
    {
        my $option_count = scalar(@{$classifier});
        # Guards against the same key being listed twice on one classifier.
        my %seen_key = ();
        for (my $index = 0; $index + 1 < $option_count; $index++)
        {
            next unless @{$classifier}[$index] eq "-metadata";
            my $key = @{$classifier}[$index + 1];
            next if $seen_key{$key}++;
            print STDERR "* creating a tree model for classifier: CL" . $clidx . " [" . $key . "]\n" if $debug;
            my $tree_model_obj = ClassifyTreeModel->new($collection, "CL" . $clidx);
            push(@{$classifier_tree_models{$key}}, $tree_model_obj);
        }
        $clidx++;
    }

    # For each piece of metadata assigned to this document, remove the
    # document from every classifier tree built on that metadata.
    print STDERR "* searching for classifier paths to be removed\n" if $debug;
    my $metadata = $doc_obj->getAllMetadata();
    foreach my $pair (@$metadata)
    {
        my ($key, $value) = @$pair;
        print STDERR "* testing " . $key . "=>" . $value . "\n" if $debug;
        next unless defined($classifier_tree_models{$key});
        foreach my $model (@{$classifier_tree_models{$key}})
        {
            print STDERR "* removing '" . $value . "' from classifier " . $model->getRootNode()->getCLID() . "\n" if $debug;
            $model->removeDocument($value, $oid, 1);
        }
    }

    # We also have to remove from browselist - the reverse of what
    # addDocument() does for a top section.
    my $dummy_model = ClassifyTreeModel->new($collection, "");
    my $browselist_node = ClassifyTreeNode->new($dummy_model, "browselist");
    # Remove the document
    $browselist_node->removeDocument($oid);
    return;
}
## deleteDocument() ##
450
# /** Looks up the two letter Lucene field code of a metadata key in the
#  *  'indexfieldmap' line(s) of an index's build.cfg.
#  *
#  *  An entry has the form '<key>-><code>' and must be preceded by
#  *  whitespace. The key is matched literally, so regular expression
#  *  metacharacters in it (for example the '.' of 'dc.Title') have no
#  *  special meaning. When several indexfieldmap lines match, the last one
#  *  wins.
#  *
#  *  @param  $indexpath  The directory that contains build.cfg
#  *  @param  $key        The metadata key to look up
#  *  @return The two character field code, or "" when the key is undefined
#  *          or empty, build.cfg cannot be opened (the index has not been
#  *          built) or the key is not mapped
#  */
sub getFieldFromBuildCFG
{
    my ($indexpath, $key) = @_;
    my $field = "";
    return $field unless defined($key) && length($key);

    my $build_cfg = &util::filename_cat($indexpath, "build.cfg");
    # If there isn't a build.cfg then the index hasn't been built and there
    # is nothing to do
    open(my $build_cfg_fh, '<', $build_cfg) or return $field;

    while (my $line = <$build_cfg_fh>)
    {
        # Only interested in the indexfieldmap line
        next unless $line =~ /^indexfieldmap\s+/;
        # \Q..\E keeps the key from being interpreted as a pattern
        if ($line =~ /\s\Q$key\E->(\w\w)/)
        {
            $field = $1;
        }
    }
    close($build_cfg_fh);

    # Return whatever we found
    return $field;
}
# /** getFieldFromBuildCFG() **/
483
484
485
486
487
# /** Reads the configuration of a collection.
#  *
#  *  The file read is <GSDLHOME>/collect/<collection>/etc/collect.cfg and
#  *  the result is whatever colcfg::read_collect_cfg() returns for it.
#  *
#  *  @param  $collection  The shortname of the collection as a string
#  *  @return The value returned by colcfg::read_collect_cfg()
#  *
#  *  Dies when the collect.cfg file does not exist.
#  *
#  *  @author John Thompson, DL Consulting Ltd.
#  */
sub getConfigObj()
{
    my ($collection) = @_;

    my @cfg_path_parts = ($ENV{'GSDLHOME'}, "collect", $collection, "etc", "collect.cfg");
    my $colcfgname = &util::filename_cat(@cfg_path_parts);

    -e $colcfgname
        or die "IncrementalBuildUtils - couldn't find collect.cfg for collection $collection\n";

    return &colcfg::read_collect_cfg($colcfgname);
}
# /** getConfigObj() **/
508
5091;
Note: See TracBrowser for help on using the repository browser.