source: main/trunk/greenstone2/perllib/IncrementalBuildUtils.pm@ 32222

Last change on this file since 32222 was 31973, checked in by ak19, 7 years ago

Related to previous commit which contained Dr Bainbridge's fix to bug noticed just now: IncrementalBuildUtils determines the PATH separator based on whether the OS name contained the win substring for Windows. However, Mac OS is called darwin and contains the win substring too, so Dr Bainbridge fixed the test to check for win as OS name prefix. This part of the code will need to be reworked in future, such as to use util's envvar_prepend and append functions instead of concatenating something that may already be on the PATH. Also, check for any existing methods to test if OS is windows or not to put it all in one place.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.3 KB
Line 
1###########################################################################
2#
3# IncrementalBuildUtils.pm -- API to assist incremental building
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25# /** Initial versions of these functions by John Thompson, revisions by
26# * and turning it into a package by John Rowe. Used heavily by
27# * basebuilder::remove_document() and getdocument.pl
28# *
29# * @version 1.0 Initial version by John Thompson
30# * @version 1.1 Addition of get_document and change of get_document_as_xml
31# * by John Rowe
32# * @version 2.0 Package version including seperation from calling code and
33# * modularisation by John Rowe
34# *
35# * @author John Thompson, DL Consulting Ltd.
36# * @author John Rowe, DL Consulting Ltd.
37# */
38###########################################################################
39package IncrementalBuildUtils;
40
41BEGIN {
42 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
43 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
44
45 # - ensure we only add perllib paths to INC if they weren't already there
46 # as otherwise we lose the ability to use order in INC as a guide for
47 # inheritence/overriding [jmt12]
48 my $gsdl_perllib_path = $ENV{'GSDLHOME'} . '/perllib';
49 my $found_path = 0;
50 foreach my $inc_path (@INC)
51 {
52 if ($inc_path eq $gsdl_perllib_path)
53 {
54 $found_path = 1;
55 last;
56 }
57 }
58 if (!$found_path)
59 {
60 unshift (@INC, $gsdl_perllib_path);
61 unshift (@INC, $gsdl_perllib_path . '/cpan');
62 unshift (@INC, $gsdl_perllib_path . '/plugins');
63 unshift (@INC, $gsdl_perllib_path . '/classify');
64 }
65}
66
67use doc;
68use cfgread;
69use colcfg;
70use strict;
71use util;
72
73use ClassifyTreeModel;
74use IncrementalDocument;
75
76# Change debugging to 1 if you want verbose debugging output
77my $debug = 1;
78
79# Ensure the collection specific binaries are on the search path
80my $path_separator = ":";
81if($ENV{'GSDLOS'} =~ /^win/) { # beware to check that it starts with "win" for windows, since darwin also contains "win" but path separator for that should be :
82 $path_separator = ";";
83}
84# - once again we need to ensure we aren't duplicating paths on the environment
85# otherwise things like extension executables won't be correctly used in
86# preference to main Greenstone ones [jmt12]
87my @env_path = split($path_separator, $ENV{'PATH'});
88my $os_binary_path = &util::filename_cat($ENV{'GSDLHOME'}, 'bin', $ENV{'GSDLOS'});
89my $script_path = &util::filename_cat($ENV{'GSDLHOME'}, 'bin', 'script');
90my $found_os_bin = 0;
91foreach my $path (@env_path)
92{
93 if ($path eq $os_binary_path)
94 {
95 $found_os_bin = 1;
96 last;
97 }
98}
99if (!$found_os_bin)
100{
101 $ENV{'PATH'} = $os_binary_path . $path_separator . $script_path . $path_separator . $ENV{'PATH'};
102}
103
104
105# /**
106# */
107sub addDocument()
108 {
109 my ($collection, $infodbtype, $doc_obj, $section, $updateindex) = @_;
110
111 $updateindex = 0 unless defined($updateindex);
112
113 print STDERR "IncrementalBuildUtils::addDocument('$collection',$infodbtype,$doc_obj,'$section')\n" unless !$debug;
114 # Gonna need to know in several places whether this is the top section
115 # of the document or not
116 my $is_top = ($section eq $doc_obj->get_top_section());
117
118 # Retrieve all of the metadata from this document object only - not any
119 # child documents
120 my $metadata = $doc_obj->get_all_metadata($section);
121 # Check and add the docnum first
122 my $found_docnum = 0;
123 foreach my $pair (@$metadata)
124 {
125 my ($key, $value) = (@$pair);
126 if ($key eq "docnum")
127 {
128 &setDocumentMetadata($collection, $infodbtype, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
129 $found_docnum = 1;
130 }
131 }
132
133 if (!$found_docnum)
134 {
135 die("Fatal Error! Tried to add document without providing docnum");
136 }
137
138 # Add it piece by piece - this depends on the loading of a blank document
139 # working the way it should.
140 foreach my $pair (@$metadata)
141 {
142 my ($key, $value) = (@$pair);
143 if ($key ne "Identifier" && $key ne "docnum" && $key !~ /^gsdl/ && defined $value && $value ne "")
144 {
145 # escape problematic stuff
146 $value =~ s/\\/\\\\/g;
147 $value =~ s/\n/\\n/g;
148 $value =~ s/\r/\\r/g;
149 if ($value =~ /-{70,}/)
150 {
151 # if value contains 70 or more hyphens in a row we need
152 # to escape them to prevent txt2db from treating them
153 # as a separator
154 $value =~ s/-/&\#045;/gi;
155 }
156 # Go ahead and set the metadata
157 &setDocumentMetadata($collection, $infodbtype, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
158 }
159 }
160 # We now have to load the browselist node too. We create a ClassifyTreeNode
161 # based on a dummy model.
162 # Note: only if section is the top section
163 if ($is_top)
164 {
165 my $dummy_model = new ClassifyTreeModel($collection, $infodbtype, "");
166 my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
167 # Add the document
168 $browselist_node->addDocument($doc_obj->get_OID());
169 }
170 # We now recursively move through the document objects child sections,
171 # adding them too. As we do this we build up a contains list for this
172 # document.
173 my $section_ptr = $doc_obj->_lookup_section($section);
174 my @contains = ();
175 if (defined $section_ptr)
176 {
177 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
178 &addDocument($collection, $infodbtype, $doc_obj, "$section.$subsection");
179 push(@contains, "\".$subsection");
180 }
181 }
182 # Done - clean up
183 }
184# /** addDocument() **/
185
186# /** Sets the metadata attached to a given document. This will update, at most,
187# * three different locations:
188# * 1. The Lucene index must be updated. This will involve removing any
189# * existing value and, if required, adding a new value in its place.
190# * 2. The info database must be updated. Again any existing value will be
191# * removed and, if required, a new value added.
192# * 3. Finally a check against the collect.cfg will be done to determine if
193# * the changed metadata would have an effect on a classifier and, if so
194# * the classifier tree will be updated to remove, add or replace any
195# * tree nodes or node 'contains lists' as necessary.
196# *
197# * Pseudo Code:
198# * ------------
199# * To add metadata to the document NT1
200# * A. Establish connection to Lucene
201# * B. Create a IncrementalDocument object for 'NT1' loading the information
202# * from the info database
203# * C. Check to see if this metadata is used to build a classifier(s) and if
204# * so create the appropriate ClassifyTreeModel(s)
205# * D. If removing or replacing metadata:
206# * i/ Call ??? to remove key-value from Lucene index
207# * ii/ Use removeMetadata() to clear value in IncrementalDocument
208# * iii/ Call removeDocument() in ClassifyTreeModel(s) as necessary
209# * E. If adding or replacing metadata:
210# * i/ Call ??? to add key-value from Lucene index
211# * ii/ Use addMetadata() to add value in IncrementalDocument
212# * iii/ Call addDocument() in ClassifyTreeModel(s) as necessary
213# * F. Complete Lucene transaction
214# * G. Save IncrementalDocument to info database
215# * Note: ClassifyTreeModel automatically updates the info database as necessary.
216# *
217# * @param $collection The name of the collection to update as a string
218# * @param $oid The unique identifier of a Greenstone document as a
219# * string
220# * @param $key The key of the metadata being added as a string
221# * @param $old_value The value of the metadata being removed/replaced
222# * or an empty string if adding metadata
223# * @param $new_value The value of the metadata being added/replacing
224# * or an empty string if removing metadata
225# * @param $updateindex 1 to get the index updated. This is used to prevent
226# * the indexes being changed when doing an incremental
227# * addition of a new document.
228# *
229# * @author John Thompson, DL Consulting Ltd.
230# */
231sub setDocumentMetadata()
232 {
233 my ($collection, $infodbtype, $oid, $key, $old_value, $new_value, $updateindex) = @_;
234 print STDERR "IncrementalBuildUtils::setDocumentMetadata('$collection',$infodbtype,'$oid','$key','$old_value','$new_value',$updateindex)\n" unless !$debug;
235 # A. Establish connection to Lucene
236 # This isn't required at the moment, but might be later if we implement
237 # Lucene daemon.
238 # B. Create a IncrementalDocument object for 'NT1' loading the information
239 # from the info database
240 print STDERR "* creating incremental document for $oid\n" unless !$debug;
241 my $doc_obj = new IncrementalDocument($collection, $infodbtype, $oid);
242 $doc_obj->loadDocument();
243 # C. Check to see if this metadata is used to build a classifier(s) and if
244 # so create the appropriate ClassifyTreeModel(s)
245 print STDERR "* load collection configuration\n" unless !$debug;
246 my $config_obj = &getConfigObj($collection);
247 my $clidx = 1;
248 my @classifier_tree_models = ();
249 foreach my $classifier (@{$config_obj->{'classify'}})
250 {
251 my $index = 0;
252 my $option_count = scalar(@{$classifier});
253 for ($index = 0; $index < $option_count; $index++)
254 {
255 if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata" && @{$classifier}[$index + 1] eq $key)
256 {
257 # Create a tree model for this classifier
258 print STDERR "* creating a tree model for classifier: CL$clidx\n" unless !$debug;
259 my $tree_model_obj = new ClassifyTreeModel($collection, $infodbtype, "CL" . $clidx);
260 # And store it for later
261 push(@classifier_tree_models, $tree_model_obj);
262 }
263 }
264 $clidx++;
265 }
266 # D. If removing or replacing metadata:
267 if (defined($old_value) && $old_value =~ /[\w\d]+/)
268 {
269 print STDERR "* removing '$key'='$old_value' from info database for document $oid\n" unless !$debug;
270 # i/ Call ??? to remove key-value from Lucene index
271 # Moved elsewhere
272 # ii/ Use removeMetadata() to clear value in IncrementalDocument
273 $doc_obj->removeMetadata($key, $old_value);
274 # iii/ Call removeDocument() in ClassifyTreeModel(s) as necessary
275 foreach my $classifier_tree_model (@classifier_tree_models)
276 {
277 print STDERR "* removing '$old_value' from classifier tree\n" unless !$debug;
278 $classifier_tree_model->removeDocument($old_value, $oid, 1);
279 }
280 }
281 # E. If adding or replacing metadata:
282 if (defined($new_value) && $new_value =~ /[\w\d]+/)
283 {
284 print STDERR "* adding '$key'='$new_value' to info database for document $oid\n" unless !$debug;
285 # i/ Call ??? to add key-value from Lucene index
286 # Moved elsewhere
287 # ii/ Use addMetadata() to add value in IncrementalDocument
288 $doc_obj->addMetadata($key, $new_value);
289 # iii/ Call addDocument() in ClassifyTreeModel(s) as necessary
290 foreach my $classifier_tree_model (@classifier_tree_models)
291 {
292 print STDERR "* adding '$new_value' to classifier tree\n" unless !$debug;
293 $classifier_tree_model->addDocument($new_value, $oid);
294 }
295 }
296 # F. Complete Lucene transaction
297 if(defined($updateindex) && $updateindex)
298 {
299 print STDERR "* updating Lucene indexes\n" unless !$debug;
300 &callGS2LuceneEditor($collection, $doc_obj->getDocNum, $key, $old_value, $new_value);
301 }
302 # G. Save IncrementalDocument to info database
303 $doc_obj->saveDocument();
304 $doc_obj = 0;
305 }
306# /** setDocumentMetadata() **/
307
308# /**
309# *
310# */
311sub callGS2LuceneDelete()
312 {
313 my ($collection, $docnum) = @_;
314
315 # Some path information that is the same for all indexes
316 my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
317 my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneDelete";
318 my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
319 # Determine what indexes need to be changed by opening the collections
320 # index path and searching for directories named *idx
321 # If the directory doesn't exist, then there is no built index, and nothing
322 # for us to do.
323 if(opendir(INDEXDIR, $indexpath))
324 {
325 my @index_files = readdir(INDEXDIR);
326 closedir(INDEXDIR);
327 # For each index that matches or pattern, we call the java application
328 # to change the index (as necessary - not every index will include the
329 # document we have been asked to modify)
330 foreach my $actual_index_dir (@index_files)
331 {
332 next unless $actual_index_dir =~ /idx$/;
333 # Determine the path to the index to modify
334 my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
335 # Call java to remove the document
336 my $cmd = "java -classpath \"$classpath\" $java_lucene --index $full_index_dir --nodeid $docnum";
337 print STDERR "CMD: " . $cmd . "\n" unless !$debug;
338 # Run command
339 my $result = `$cmd 2>&1`;
340 print STDERR $result unless !$debug;
341 }
342 }
343 # Done
344 }
345# /** callGS2LuceneDelete() **/
346
347# /**
348# */
349sub callGS2LuceneEditor()
350 {
351 my ($collection, $docnum, $key, $old_value, $new_value) = @_;
352
353 # Some path information that is the same for all indexes
354 my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"java","classes");
355 my $jarpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
356 my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneEditor";
357 my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
358 # And some commands that don't change
359 my $java_args = "";
360 # Append the node id
361 $java_args .= "--nodeid $docnum ";
362 # We have to convert the given metadata key into its two letter field code.
363 # We do this by looking in the build.cfg file.
364 my $field = &getFieldFromBuildCFG($indexpath, $key);
365 # The metadata field to change
366 $java_args .= "--field $field ";
367 # And the old and new values as necessary
368 if(defined($old_value) && $old_value =~ /[\w\d]+/)
369 {
370 $java_args .= "--oldvalue \"$old_value\" ";
371 }
372 if(defined($new_value) && $new_value =~ /[\w\d]+/)
373 {
374 $java_args .= "--newvalue \"$new_value\" ";
375 }
376 # Determine what indexes need to be changed by opening the collections
377 # index path and searching for directories named *idx
378 # If the directory doesn't exist, then there is no built index, and nothing
379 # for us to do.
380 # We also check if the field is something other than "". It is entirely
381 # possible that we have been asked to update a metadata field that isn't
382 # part of any index, so this is where we break out of editing the index if
383 # we have
384 if($field =~ /^\w\w$/ && opendir(INDEXDIR, $indexpath))
385 {
386 my @index_files = readdir(INDEXDIR);
387 closedir(INDEXDIR);
388 # For each index that matches or pattern, we call the java application
389 # to change the index (as necessary - not every index will include the
390 # document we have been asked to modify)
391 foreach my $actual_index_dir (@index_files)
392 {
393 next unless $actual_index_dir =~ /idx$/;
394 # Determine the path to the index to modify
395 my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
396 # And prepend to the command java arguments
397 my $cur_java_args = "--index $full_index_dir " . $java_args;
398 print STDERR "CMD: java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1\n" unless !$debug;
399 # Run command
400 my $result = `java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1`;
401 print STDERR $result unless !$debug;
402 }
403 }
404 # Done
405 }
406# /** callGS2LuceneEditor() **/
407
408## Remove a document from the info database and Index.
409#
410# @param collection The collection to alter
411# @param oid The unique identifier of the document to be removed
412##
413sub deleteDocument()
414 {
415 my ($collection, $infodbtype, $oid) = @_;
416 # Load the incremental document to go with this oid, as we need some
417 # information from it.
418 my $doc_obj = new IncrementalDocument($collection, $infodbtype, $oid);
419 $doc_obj->loadDocument();
420 # Check if this object even exists by retrieving the docnum.
421 my $doc_num = $doc_obj->getDocNum();
422 print STDERR "Removing document docnum: $doc_num\n" unless !$debug;
423 if ($doc_num > -1)
424 {
425 # Now write a blank string to this oid in the info database
426 my $index_text_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text");
427 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collection, $index_text_directory_path);
428 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $infodb_file_path, "append");
429 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, &dbutil::convert_infodb_string_to_hash(""));
430 # Remove reverse lookup
431 &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $doc_num, &dbutil::convert_infodb_string_to_hash(""));
432 &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
433
434 # And remove from the database
435 &callGS2LuceneDelete($collection, $doc_num);
436
437 # Regenerate the classifier trees.
438 print STDERR "* load collection configuration\n";# unless !$debug;
439 my $config_obj = &getConfigObj($collection);
440 my $clidx = 1;
441 my %classifier_tree_models = ();
442 foreach my $classifier (@{$config_obj->{'classify'}})
443 {
444 my $index = 0;
445 my $option_count = scalar(@{$classifier});
446 for ($index = 0; $index < $option_count; $index++)
447 {
448 if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata")
449 {
450 my $key = @{$classifier}[$index + 1];
451 # Create a tree model for this classifier
452 print STDERR "* creating a tree model for classifier: CL" . $clidx . " [" . $key . "]\n";# unless !$debug;
453 my $tree_model_obj = new ClassifyTreeModel($collection, $infodbtype, "CL" . $clidx);
454 # And store it against its key for later
455 $classifier_tree_models{$key} = $tree_model_obj;
456 }
457 }
458 $clidx++;
459 }
460
461 # For each piece of metadata assigned to this document, if there is a
462 # matching classifier tree, remove the path from the tree.
463 print STDERR "* searching for classifier paths to be removed\n";
464
465 my $metadata = $doc_obj->getAllMetadata();
466 foreach my $pair (@$metadata)
467 {
468 my ($key, $value) = @$pair;
469 print STDERR "* testing " . $key . "=>" . $value . "\n";
470 if (defined($classifier_tree_models{$key}))
471 {
472 my $model = $classifier_tree_models{$key};
473 print STDERR "* removing '" . $value . "' from classifier " . $model->getRootNode()->getCLID() . "\n";
474 $model->removeDocument($value, $oid, 1);
475 }
476 }
477
478 # We also have to remove from browselist - the reverse process of
479 # adding to browselist shown above.
480 my $dummy_model = new ClassifyTreeModel($collection, $infodbtype, "");
481 my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
482 # Add the document
483 $browselist_node->removeDocument($oid);
484 # Clean up
485 }
486 # else, no document, no need to delete.
487 }
488## deleteDocument() ##
489
490# /**
491# */
492sub getFieldFromBuildCFG()
493 {
494 my ($indexpath, $key) = @_;
495 my $field = "";
496 my $build_cfg = &util::filename_cat($indexpath, "build.cfg");
497 # If there isn't a build.cfg then the index hasn't been built and there is
498 # nothing to do
499 if(open(BUILDCFG, $build_cfg))
500 {
501 # For each line of the build configuration
502 my $line;
503 while($line = <BUILDCFG>)
504 {
505 # Only interested in the indexfieldmap line
506 if($line =~ /^indexfieldmap\s+/)
507 {
508 # Extract the field information by looking up the key pair
509 if($line =~ /\s$key->(\w\w)/)
510 {
511 $field = $1;
512 }
513 }
514 }
515 # Done with file
516 close(BUILDCFG);
517 }
518 # Return whatever we found
519 return $field;
520 }
521# /** getFieldFromBuildCFG() **/
522
523
524
525
526
527# /** Retrieve an object (associative array) containing information about the
528# * collection configuration.
529# * @param $collection The shortname of the collection as a string
530# * @return An associative array containing information from the collect.cfg
531# * @author John Thompson, DL Consulting Ltd.
532# */
533sub getConfigObj()
534 {
535 my ($collection) = @_;
536
537 #rint STDERR "getConfigObj()\n" unless !$debug;
538
539 my $colcfgname = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "etc", "collect.cfg");
540 if (!-e $colcfgname)
541 {
542 die "IncrementalBuildUtils - couldn't find collect.cfg for collection $collection\n";
543 }
544 return &colcfg::read_collect_cfg ($colcfgname);
545 }
546# /** getConfigObj() **/
547
5481;
Note: See TracBrowser for help on using the repository browser.