source: gsdl/trunk/perllib/IncrementalBuildUtils.pm @ 14374

Last change on this file since 14374 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.5 KB
###########################################################################
#
# IncrementalBuildUtils.pm -- API to assist incremental building
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# /** Initial versions of these functions by John Thompson; revised and
# *  turned into a package by John Rowe. Used heavily by
# *  basebuilder::remove_document() and getdocument.pl
# *
# *  @version 1.0 Initial version by John Thompson
# *  @version 1.1 Addition of get_document and change of get_document_as_xml
# *               by John Rowe
# *  @version 2.0 Package version including separation from calling code and
# *               modularisation by creating gdbmget, gdbmset and
# *               get_database_path by John Rowe
# *
# *  @author John Thompson, DL Consulting Ltd.
# *  @author John Rowe, DL Consulting Ltd.
# */
###########################################################################
package IncrementalBuildUtils;

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use doc;
use cfgread;
use colcfg;
use util;

use ClassifyTreeModel;
use ClassifyTreeNode;
use GDBMUtils;
use IncrementalDocument;

# Set $debug to 0 to turn off the verbose debugging output
$debug = 1;

# Ensure the collection specific binaries are on the search path
my $path_separator = ":";
if ($ENV{'GSDLOS'} =~ /win/) {
    $path_separator = ";";
}
$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}) . $path_separator . &util::filename_cat($ENV{'GSDLHOME'}, "bin", "script") . $path_separator . $ENV{'PATH'};

# /** Adds a document section (and, recursively, its child sections) to the
# *  collection's GDBM database, and adds top-level documents to the
# *  browselist classifier.
# */
sub addDocument()
  {
    my ($collection, $doc_obj, $section, $updateindex) = @_;

    $updateindex = 0 unless defined($updateindex);

    print STDERR "IncrementalBuildUtils::addDocument('$collection',$doc_obj,'$section')\n" unless !$debug;
    # We need to know in several places whether this is the top section
    # of the document or not
    my $is_top = ($section eq $doc_obj->get_top_section());

    # Retrieve all of the metadata from this document object only - not any
    # child documents
    my $metadata = $doc_obj->get_all_metadata($section);
    # Check and add the docnum first
    my $found_docnum = 0;
    foreach my $pair (@$metadata)
      {
        my ($key, $value) = (@$pair);
        if ($key eq "docnum")
          {
            &setDocumentMetadata($collection, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
            $found_docnum = 1;
          }
      }

    if (!$found_docnum)
      {
        die("Fatal Error! Tried to add document without providing docnum");
      }

    # Add it piece by piece - this depends on the loading of a blank document
    # working the way it should.
    foreach my $pair (@$metadata)
      {
        my ($key, $value) = (@$pair);
        if ($key ne "Identifier" && $key ne "docnum" && $key !~ /^gsdl/ && defined $value && $value ne "")
          {
            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;
            if ($value =~ /-{70,}/)
              {
                # if value contains 70 or more hyphens in a row we need
                # to escape them to prevent txt2db from treating them
                # as a separator
                $value =~ s/-/&\#045;/gi;
              }
            # Go ahead and set the metadata
            &setDocumentMetadata($collection, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
          }
      }
    # We now have to load the browselist node too. We create a ClassifyTreeNode
    # based on a dummy model.
    # Note: only if section is the top section
    if ($is_top)
      {
        my $dummy_model = new ClassifyTreeModel($collection, "");
        my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
        # Add the document
        $browselist_node->addDocument($doc_obj->get_OID());
      }
    # We now recursively move through the document object's child sections,
    # adding them too. As we do this we build up a contains list for this
    # document.
    my $section_ptr = $doc_obj->_lookup_section($section);
    my @contains = ();
    if (defined $section_ptr)
      {
        foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
            &addDocument($collection, $doc_obj, "$section.$subsection");
            push(@contains, "\".$subsection");
        }
      }
    # Done - clean up
  }
# /** addDocument() **/
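
# A hypothetical usage sketch (not part of the original code): a caller that
# has already built a doc.pm document object containing a "docnum" metadata
# entry might add it to a collection like this:
#
#   my $doc_obj = ...;   # a doc.pm document object built elsewhere
#   &IncrementalBuildUtils::addDocument("demo", $doc_obj,
#                                       $doc_obj->get_top_section());
#
# The collection name "demo" is an invented placeholder; the optional fourth
# argument ($updateindex) defaults to 0 when omitted, which leaves the Lucene
# indexes untouched.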

# /** Sets the metadata attached to a given document. This will update, at most,
# *  three different locations:
# *  1. The Lucene index must be updated. This will involve removing any
# *     existing value and, if required, adding a new value in its place.
# *  2. The GDBM database must be updated. Again any existing value will be
# *     removed and, if required, a new value added.
# *  3. Finally a check against the collect.cfg will be done to determine if
# *     the changed metadata would have an effect on a classifier and, if so,
# *     the classifier tree will be updated to remove, add or replace any
# *     tree nodes or node 'contains lists' as necessary.
# *
# *  Pseudo Code:
# *  ------------
# *  To add metadata to the document NT1:
# *  A. Establish connection to Lucene
# *  B. Create an IncrementalDocument object for 'NT1', loading the information
# *     from the GDBM database
# *  C. Check to see if this metadata is used to build a classifier(s) and if
# *     so create the appropriate ClassifyTreeModel(s)
# *  D. If removing or replacing metadata:
# *     i/   Call ??? to remove the key-value pair from the Lucene index
# *     ii/  Use removeMetadata() to clear the value in the IncrementalDocument
# *     iii/ Call removeDocument() in the ClassifyTreeModel(s) as necessary
# *  E. If adding or replacing metadata:
# *     i/   Call ??? to add the key-value pair to the Lucene index
# *     ii/  Use addMetadata() to add the value in the IncrementalDocument
# *     iii/ Call addDocument() in the ClassifyTreeModel(s) as necessary
# *  F. Complete Lucene transaction
# *  G. Save IncrementalDocument to GDBM
# *  Note: ClassifyTreeModel automatically updates GDBM as necessary.
# *
# *  @param $collection  The name of the collection to update as a string
# *  @param $oid         The unique identifier of a Greenstone document as a
# *                      string
# *  @param $key         The key of the metadata being added as a string
# *  @param $old_value   The value of the metadata being removed/replaced,
# *                      or an empty string if adding metadata
# *  @param $new_value   The value of the metadata being added/replacing,
# *                      or an empty string if removing metadata
# *  @param $updateindex 1 to get the index updated. This is used to prevent
# *                      the indexes being changed when doing an incremental
# *                      addition of a new document.
# *
# *  @author John Thompson, DL Consulting Ltd.
# */
sub setDocumentMetadata()
  {
    my ($collection, $oid, $key, $old_value, $new_value, $updateindex) = @_;
    print STDERR "IncrementalBuildUtils::setDocumentMetadata('$collection','$oid','$key','$old_value','$new_value',$updateindex)\n" unless !$debug;
    # A. Establish connection to Lucene
    #    This isn't required at the moment, but might be later if we implement
    #    a Lucene daemon.
    # B. Create an IncrementalDocument object for this OID, loading the
    #    information from the GDBM database
    print STDERR "* creating incremental document for $oid\n" unless !$debug;
    my $doc_obj = new IncrementalDocument($collection, $oid);
    $doc_obj->loadDocument();
    # C. Check to see if this metadata is used to build a classifier(s) and if
    #    so create the appropriate ClassifyTreeModel(s)
    print STDERR "* load collection configuration\n" unless !$debug;
    my $config_obj = &getConfigObj($collection);
    my $clidx = 1;
    my @classifier_tree_models = ();
    foreach my $classifier (@{$config_obj->{'classify'}})
      {
        my $index = 0;
        my $option_count = scalar(@{$classifier});
        for ($index = 0; $index < $option_count; $index++)
          {
            if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata" && @{$classifier}[$index + 1] eq $key)
              {
                # Create a tree model for this classifier
                print STDERR "* creating a tree model for classifier: CL$clidx\n" unless !$debug;
                my $tree_model_obj = new ClassifyTreeModel($collection, "CL" . $clidx);
                # And store it for later
                push(@classifier_tree_models, $tree_model_obj);
              }
          }
        $clidx++;
      }
    # D. If removing or replacing metadata:
    if (defined($old_value) && $old_value =~ /[\w\d]+/)
      {
        print STDERR "* removing '$old_value' from GDBM database\n" unless !$debug;
        # i/ Call ??? to remove key-value from Lucene index
        #    Moved elsewhere
        # ii/ Use removeMetadata() to clear value in IncrementalDocument
        $doc_obj->removeMetadata($key, $old_value);
        # iii/ Call removeDocument() in ClassifyTreeModel(s) as necessary
        foreach my $classifier_tree_model (@classifier_tree_models)
          {
            print STDERR "* removing '$old_value' from classifier tree\n" unless !$debug;
            $classifier_tree_model->removeDocument($old_value, $oid, 1);
          }
      }
    # E. If adding or replacing metadata:
    if (defined($new_value) && $new_value =~ /[\w\d]+/)
      {
        print STDERR "* adding '$new_value' to GDBM database\n" unless !$debug;
        # i/ Call ??? to add key-value to Lucene index
        #    Moved elsewhere
        # ii/ Use addMetadata() to add value in IncrementalDocument
        $doc_obj->addMetadata($key, $new_value);
        # iii/ Call addDocument() in ClassifyTreeModel(s) as necessary
        foreach my $classifier_tree_model (@classifier_tree_models)
          {
            print STDERR "* adding '$new_value' to classifier tree\n" unless !$debug;
            $classifier_tree_model->addDocument($new_value, $oid);
          }
      }
    # F. Complete Lucene transaction
    if (defined($updateindex) && $updateindex)
      {
        print STDERR "* updating Lucene indexes\n" unless !$debug;
        &callGS2LuceneEditor($collection, $doc_obj->getDocNum(), $key, $old_value, $new_value);
      }
    # G. Save IncrementalDocument to GDBM
    $doc_obj->saveDocument();
    $doc_obj = 0;
  }
# /** setDocumentMetadata() **/
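
# A hypothetical usage sketch (values invented for illustration): replacing
# the dc.Title metadata of document HASH0153f9a2 in a collection called
# "demo", updating the GDBM database, any affected classifiers, and the
# Lucene indexes:
#
#   &IncrementalBuildUtils::setDocumentMetadata("demo", "HASH0153f9a2",
#                                               "dc.Title",
#                                               "Old title", "New title", 1);
#
# The collection name, OID and metadata values above are placeholders only.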

# /** Removes a document node, identified by its document number, from every
# *  built Lucene index of the given collection.
# */
sub callGS2LuceneDelete()
  {
    my ($collection, $docnum) = @_;

    # Some path information that is the same for all indexes
    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneDelete";
    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
    # Determine what indexes need to be changed by opening the collection's
    # index path and searching for directories named *idx
    # If the directory doesn't exist, then there is no built index, and nothing
    # for us to do.
    if (opendir(INDEXDIR, $indexpath))
      {
        my @index_files = readdir(INDEXDIR);
        closedir(INDEXDIR);
        # For each index that matches our pattern, we call the java application
        # to change the index (as necessary - not every index will include the
        # document we have been asked to modify)
        foreach my $actual_index_dir (@index_files)
          {
            next unless $actual_index_dir =~ /idx$/;
            # Determine the path to the index to modify
            my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
            # Call java to remove the document
            my $cmd = "java -classpath \"$classpath\" $java_lucene --index $full_index_dir --nodeid $docnum";
            print STDERR "CMD: " . $cmd . "\n" unless !$debug;
            # Run command
            my $result = `$cmd 2>&1`;
            print STDERR $result unless !$debug;
          }
      }
    # Done
  }
# /** callGS2LuceneDelete() **/
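
# For reference, the command built above ends up looking roughly like the
# following (the collection name "demo", index directory "didx" and docnum
# 1742 are made-up values for illustration):
#
#   java -classpath "$GSDLHOME/bin/java/LuceneWrap.jar" \
#        org.nzdl.gsdl.LuceneWrap.GS2LuceneDelete \
#        --index $GSDLHOME/collect/demo/index/didx --nodeid 1742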

# /** Updates a single metadata field of a document node in every built Lucene
# *  index of the given collection, replacing $old_value with $new_value.
# */
sub callGS2LuceneEditor()
  {
    my ($collection, $docnum, $key, $old_value, $new_value) = @_;

    # Some path information that is the same for all indexes
    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"java","classes");
    my $jarpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneEditor";
    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
    # And some arguments that don't change
    my $java_args = "";
    # Append the node id
    $java_args .= "--nodeid $docnum ";
    # We have to convert the given metadata key into its two letter field code.
    # We do this by looking in the build.cfg file.
    my $field = &getFieldFromBuildCFG($indexpath, $key);
    # The metadata field to change
    $java_args .= "--field $field ";
    # And the old and new values as necessary
    if (defined($old_value) && $old_value =~ /[\w\d]+/)
      {
        $java_args .= "--oldvalue \"$old_value\" ";
      }
    if (defined($new_value) && $new_value =~ /[\w\d]+/)
      {
        $java_args .= "--newvalue \"$new_value\" ";
      }
    # Determine what indexes need to be changed by opening the collection's
    # index path and searching for directories named *idx
    # If the directory doesn't exist, then there is no built index, and nothing
    # for us to do.
    # We also check that the field is something other than "". It is entirely
    # possible that we have been asked to update a metadata field that isn't
    # part of any index, in which case we skip editing the indexes altogether.
    if ($field =~ /^\w\w$/ && opendir(INDEXDIR, $indexpath))
      {
        my @index_files = readdir(INDEXDIR);
        closedir(INDEXDIR);
        # For each index that matches our pattern, we call the java application
        # to change the index (as necessary - not every index will include the
        # document we have been asked to modify)
        foreach my $actual_index_dir (@index_files)
          {
            next unless $actual_index_dir =~ /idx$/;
            # Determine the path to the index to modify
            my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
            # And prepend to the command java arguments
            my $cur_java_args = "--index $full_index_dir " . $java_args;
            print STDERR "CMD: java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1\n" unless !$debug;
            # Run command
            my $result = `java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1`;
            print STDERR $result unless !$debug;
          }
      }
    # Done
  }
# /** callGS2LuceneEditor() **/
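
# For reference, the resulting invocation looks roughly like this (collection
# "demo", index "didx", docnum 1742, field "TI" and the title values are all
# illustrative placeholders):
#
#   java -classpath "$GSDLHOME/collect/demo/java/classes:$GSDLHOME/bin/java/LuceneWrap.jar" \
#        org.nzdl.gsdl.LuceneWrap.GS2LuceneEditor \
#        --index $GSDLHOME/collect/demo/index/didx --nodeid 1742 \
#        --field TI --oldvalue "Old title" --newvalue "New title"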

## Remove a document from the GDBM database and the Lucene index.
#
# @param collection The collection to alter
# @param oid        The unique identifier of the document to be removed
##
sub deleteDocument()
  {
    my ($collection, $oid) = @_;
    # Load the incremental document to go with this oid, as we need some
    # information from it.
    my $doc_obj = new IncrementalDocument($collection, $oid);
    $doc_obj->loadDocument();
    # Check if this object even exists by retrieving the docnum.
    my $doc_num = $doc_obj->getDocNum();
    print STDERR "Removing document docnum: $doc_num\n" unless !$debug;
    if ($doc_num > -1)
      {
        # Now use the GDBM utils to write a blank string to this oid in the
        # database
        &GDBMUtils::gdbmSet($collection, $oid, "");
        # Remove reverse lookup
        &GDBMUtils::gdbmSet($collection, $doc_num, "");
        # And remove from the Lucene index
        &callGS2LuceneDelete($collection, $doc_num);

        # Regenerate the classifier trees.
        print STDERR "* load collection configuration\n";# unless !$debug;
        my $config_obj = &getConfigObj($collection);
        my $clidx = 1;
        my %classifier_tree_models = ();
        foreach my $classifier (@{$config_obj->{'classify'}})
          {
            my $index = 0;
            my $option_count = scalar(@{$classifier});
            for ($index = 0; $index < $option_count; $index++)
              {
                if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata")
                  {
                    my $key = @{$classifier}[$index + 1];
                    # Create a tree model for this classifier
                    print STDERR "* creating a tree model for classifier: CL" . $clidx . " [" . $key . "]\n";# unless !$debug;
                    my $tree_model_obj = new ClassifyTreeModel($collection, "CL" . $clidx);
                    # And store it against its key for later
                    $classifier_tree_models{$key} = $tree_model_obj;
                  }
              }
            $clidx++;
          }

        # For each piece of metadata assigned to this document, if there is a
        # matching classifier tree, remove the path from the tree.
        print STDERR "* searching for classifier paths to be removed\n";

        my $metadata = $doc_obj->getAllMetadata();
        foreach my $pair (@$metadata)
          {
            my ($key, $value) = @$pair;
            print STDERR "* testing " . $key . "=>" . $value . "\n";
            if (defined($classifier_tree_models{$key}))
              {
                my $model = $classifier_tree_models{$key};
                print STDERR "* removing '" . $value . "' from classifier " . $model->getRootNode()->getCLID() . "\n";
                $model->removeDocument($value, $oid, 1);
              }
          }

        # We also have to remove from browselist - the reverse process of
        # adding to browselist shown above.
        my $dummy_model = new ClassifyTreeModel($collection, "");
        my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
        # Remove the document
        $browselist_node->removeDocument($oid);
        # Clean up
      }
    # else, no document, no need to delete.
  }
## deleteDocument() ##
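
# A hypothetical usage sketch (values invented for illustration): removing
# document HASH0153f9a2 from the "demo" collection, which clears its GDBM
# entries, its Lucene index entries, and any classifier/browselist references:
#
#   &IncrementalBuildUtils::deleteDocument("demo", "HASH0153f9a2");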

# /** Looks up the two letter index field code that the given metadata key is
# *  mapped to on the indexfieldmap line of the index's build.cfg file.
# *  Returns an empty string if no mapping is found.
# */
sub getFieldFromBuildCFG()
  {
    my ($indexpath, $key) = @_;
    my $field = "";
    my $build_cfg = &util::filename_cat($indexpath, "build.cfg");
    # If there isn't a build.cfg then the index hasn't been built and there is
    # nothing to do
    if (open(BUILDCFG, $build_cfg))
      {
        # For each line of the build configuration
        my $line;
        while ($line = <BUILDCFG>)
          {
            # Only interested in the indexfieldmap line
            if ($line =~ /^indexfieldmap\s+/)
              {
                # Extract the field information by looking up the key pair
                if ($line =~ /\s$key->(\w\w)/)
                  {
                    $field = $1;
                  }
              }
          }
        # Done with file
        close(BUILDCFG);
      }
    # Return whatever we found
    return $field;
  }
# /** getFieldFromBuildCFG() **/
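
# To illustrate (an assumed build.cfg fragment, not taken from this file):
# if build.cfg contains an indexfieldmap line such as
#
#   indexfieldmap text->TX dc.Title->TI
#
# then getFieldFromBuildCFG($indexpath, "dc.Title") would return "TI", while
# an unmapped key such as "dc.Creator" would return "".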


# /** Retrieve an object (associative array) containing information about the
# *  collection configuration.
# *  @param $collection The shortname of the collection as a string
# *  @return An associative array containing information from the collect.cfg
# *  @author John Thompson, DL Consulting Ltd.
# */
sub getConfigObj()
  {
    my ($collection) = @_;

    # print STDERR "getConfigObj()\n" unless !$debug;

    my $colcfgname = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "etc", "collect.cfg");
    if (!-e $colcfgname)
      {
        die "incremental_build - couldn't find collect.cfg for collection $collection\n";
      }
    return &colcfg::read_collect_cfg ($colcfgname);
  }
# /** getConfigObj() **/
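
# A brief usage sketch (hypothetical collection name "demo"): the returned
# structure is whatever colcfg::read_collect_cfg produces, and this module
# only relies on its 'classify' entry, e.g.:
#
#   my $config_obj = &IncrementalBuildUtils::getConfigObj("demo");
#   foreach my $classifier (@{$config_obj->{'classify'}}) {
#       # each $classifier is a list of classifier options; this module scans
#       # it for a "-metadata" option to find classifiers affected by an edit
#   }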

1;