root/main/trunk/greenstone2/perllib/IncrementalBuildUtils.pm @ 31973

Revision 31973, 22.3 KB (checked in by ak19, 3 years ago)

Related to previous commit which contained Dr Bainbridge's fix to bug noticed just now: IncrementalBuildUtils? determines the PATH separator based on whether the OS name contained the win substring for Windows. However, Mac OS is called darwin and contains the win substring too, so Dr Bainbridge fixed the test to check for win as OS name prefix. This part of the code will need to be reworked in future, such as to use util's envvar_prepend and append functions instead of concatenating something that may already be on the PATH. Also, check for any existing methods to test if OS is windows or not to put it all in one place.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# IncrementalBuildUtils.pm -- API to assist incremental building
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25# /** Initial versions of these functions by John Thompson, revisions by
26#  *  and turning it into a package by John Rowe. Used heavily by
27#  *  basebuilder::remove_document() and getdocument.pl
28#  *
29#  *  @version 1.0 Initial version by John Thompson
30#  *  @version 1.1 Addition of get_document and change of get_document_as_xml
31#  *               by John Rowe
32#  *  @version 2.0 Package version including seperation from calling code and
33#  *               modularisation by John Rowe
34#  *
35#  *  @author John Thompson, DL Consulting Ltd.
36#  *  @author John Rowe, DL Consulting Ltd.
37#  */
38###########################################################################
39package IncrementalBuildUtils;
40
41BEGIN {
42    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
43    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
44
45    # - ensure we only add perllib paths to INC if they weren't already there
46    # as otherwise we lose the ability to use order in INC as a guide for
47    # inheritence/overriding [jmt12]
48    my $gsdl_perllib_path = $ENV{'GSDLHOME'} . '/perllib';
49    my $found_path = 0;
50    foreach my $inc_path (@INC)
51    {
52      if ($inc_path eq $gsdl_perllib_path)
53      {
54        $found_path = 1;
55        last;
56      }
57    }
58    if (!$found_path)
59    {
60      unshift (@INC, $gsdl_perllib_path);
61      unshift (@INC, $gsdl_perllib_path . '/cpan');
62      unshift (@INC, $gsdl_perllib_path . '/plugins');
63      unshift (@INC, $gsdl_perllib_path . '/classify');
64    }
65}
66
67use doc;
68use cfgread;
69use colcfg;
70use strict;
71use util;
72
73use ClassifyTreeModel;
74use IncrementalDocument;
75
76# Change debugging to 1 if you want verbose debugging output
77my $debug = 1;
78
79# Ensure the collection specific binaries are on the search path
80my $path_separator = ":";
81if($ENV{'GSDLOS'} =~ /^win/) { # beware to check that it starts with "win" for windows, since darwin also contains "win" but path separator for that should be :
82  $path_separator = ";";
83}
84# - once again we need to ensure we aren't duplicating paths on the environment
85# otherwise things like extension executables won't be correctly used in
86# preference to main Greenstone ones [jmt12]
87my @env_path = split($path_separator, $ENV{'PATH'});
88my $os_binary_path = &util::filename_cat($ENV{'GSDLHOME'}, 'bin', $ENV{'GSDLOS'});
89my $script_path = &util::filename_cat($ENV{'GSDLHOME'}, 'bin', 'script');
90my $found_os_bin = 0;
91foreach my $path (@env_path)
92{
93  if ($path eq $os_binary_path)
94  {
95    $found_os_bin = 1;
96    last;
97  }
98}
99if (!$found_os_bin)
100{
101  $ENV{'PATH'} = $os_binary_path . $path_separator . $script_path . $path_separator . $ENV{'PATH'};
102}
103
104
105# /**
106#  */
107sub addDocument()
108  {
109    my ($collection, $infodbtype, $doc_obj, $section, $updateindex) = @_;
110
111    $updateindex = 0 unless defined($updateindex);
112
113    print STDERR "IncrementalBuildUtils::addDocument('$collection',$infodbtype,$doc_obj,'$section')\n" unless !$debug;
114    # Gonna need to know in several places whether this is the top section
115    # of the document or not
116    my $is_top = ($section eq $doc_obj->get_top_section());
117
118    # Retrieve all of the metadata from this document object only - not any
119    # child documents
120    my $metadata = $doc_obj->get_all_metadata($section);
121    # Check and add the docnum first
122    my $found_docnum = 0;
123    foreach my $pair (@$metadata)
124      {
125        my ($key, $value) = (@$pair);
126        if ($key eq "docnum")
127          {
128            &setDocumentMetadata($collection, $infodbtype, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
129            $found_docnum = 1;
130          }
131      }
132
133    if (!$found_docnum)
134      {
135        die("Fatal Error! Tried to add document without providing docnum");
136      }
137
138    # Add it piece by piece - this depends on the loading of a blank document
139    # working the way it should.
140    foreach my $pair (@$metadata)
141      {
142        my ($key, $value) = (@$pair);
143        if ($key ne "Identifier" && $key ne "docnum" && $key !~ /^gsdl/ && defined $value && $value ne "")
144          {
145            # escape problematic stuff
146            $value =~ s/\\/\\\\/g;
147            $value =~ s/\n/\\n/g;
148            $value =~ s/\r/\\r/g;
149            if ($value =~ /-{70,}/)
150              {
151                # if value contains 70 or more hyphens in a row we need
152                # to escape them to prevent txt2db from treating them
153                # as a separator
154                $value =~ s/-/&\#045;/gi;
155              }
156            # Go ahead and set the metadata
157            &setDocumentMetadata($collection, $infodbtype, $doc_obj->get_OID() . "$section", $key, "", $value, $updateindex);
158          }
159      }
160    # We now have to load the browselist node too. We create a ClassifyTreeNode
161    # based on a dummy model.
162    # Note: only if section is the top section
163    if ($is_top)
164      {
165        my $dummy_model = new ClassifyTreeModel($collection, $infodbtype, "");
166        my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
167        # Add the document
168        $browselist_node->addDocument($doc_obj->get_OID());
169      }
170    # We now recursively move through the document objects child sections,
171    # adding them too. As we do this we build up a contains list for this
172    # document.
173    my $section_ptr = $doc_obj->_lookup_section($section);
174    my @contains = ();
175    if (defined $section_ptr)
176      {
177        foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
178          &addDocument($collection, $infodbtype, $doc_obj, "$section.$subsection");
179          push(@contains, "\".$subsection");
180        }
181      }
182    # Done - clean up
183  }
184# /** addDocument() **/
185
186# /** Sets the metadata attached to a given document. This will update, at most,
187#  *  three different locations:
188#  *  1. The Lucene index must be updated. This will involve removing any
189#  *     existing value and, if required, adding a new value in its place.
190#  *  2. The info database must be updated. Again any existing value will be
191#  *     removed and, if required, a new value added.
192#  *  3. Finally a check against the collect.cfg will be done to determine if
193#  *     the changed metadata would have an effect on a classifier and, if so
194#  *     the classifier tree will be updated to remove, add or replace any
195#  *     tree nodes or node 'contains lists' as necessary.
196#  *
197#  *  Pseudo Code:
198#  *  ------------
199#  *  To add metadata to the document NT1
200#  *  A. Establish connection to Lucene
201#  *  B. Create a IncrementalDocument object for 'NT1' loading the information
202#  *     from the info database
203#  *  C. Check to see if this metadata is used to build a classifier(s) and if
204#  *     so create the appropriate ClassifyTreeModel(s)
205#  *  D. If removing or replacing metadata:
206#  *     i/   Call ??? to remove key-value from Lucene index
207#  *     ii/  Use removeMetadata() to clear value in IncrementalDocument
208#  *     iii/ Call removeDocument() in ClassifyTreeModel(s) as necessary
209#  *  E. If adding or replacing metadata:
210#  *     i/   Call ??? to add key-value from Lucene index
211#  *     ii/ Use addMetadata() to add value in IncrementalDocument
212#  *     iii/ Call addDocument() in ClassifyTreeModel(s) as necessary
213#  *  F. Complete Lucene transaction
214#  *  G. Save IncrementalDocument to info database
215#  *  Note: ClassifyTreeModel automatically updates the info database as necessary.
216#  *
217#  *  @param  $collection  The name of the collection to update as a string
218#  *  @param  $oid         The unique identifier of a Greenstone document as a
219#  *                       string
220#  *  @param  $key         The key of the metadata being added as a string
221#  *  @param  $old_value   The value of the metadata being removed/replaced
222#  *                       or an empty string if adding metadata
223#  *  @param  $new_value   The value of the metadata being added/replacing
224#  *                       or an empty string if removing metadata
225#  *  @param  $updateindex 1 to get the index updated. This is used to prevent
226#  *                       the indexes being changed when doing an incremental
227#  *                       addition of a new document.
228#  *
229#  *  @author John Thompson, DL Consulting Ltd.
230#  */
231sub setDocumentMetadata()
232  {
233    my ($collection, $infodbtype, $oid, $key, $old_value, $new_value, $updateindex) = @_;
234    print STDERR "IncrementalBuildUtils::setDocumentMetadata('$collection',$infodbtype,'$oid','$key','$old_value','$new_value',$updateindex)\n" unless !$debug;
235    # A. Establish connection to Lucene
236    #    This isn't required at the moment, but might be later if we implement
237    #    Lucene daemon.
238    # B. Create a IncrementalDocument object for 'NT1' loading the information
239    #    from the info database
240    print STDERR "* creating incremental document for $oid\n" unless !$debug;
241    my $doc_obj = new IncrementalDocument($collection, $infodbtype, $oid);
242    $doc_obj->loadDocument();
243    # C. Check to see if this metadata is used to build a classifier(s) and if
244    #    so create the appropriate ClassifyTreeModel(s)
245    print STDERR "* load collection configuration\n" unless !$debug;
246    my $config_obj = &getConfigObj($collection);
247    my $clidx = 1;
248    my @classifier_tree_models = ();
249    foreach my $classifier (@{$config_obj->{'classify'}})
250      {
251        my $index = 0;
252        my $option_count = scalar(@{$classifier});
253        for ($index = 0; $index < $option_count; $index++)
254          {
255            if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata" && @{$classifier}[$index + 1] eq $key)
256              {
257                # Create a tree model for this classifier
258                print STDERR "* creating a tree model for classifier: CL$clidx\n" unless !$debug;
259                my $tree_model_obj = new ClassifyTreeModel($collection, $infodbtype, "CL" . $clidx);
260                # And store it for later
261                push(@classifier_tree_models, $tree_model_obj);
262              }
263          }
264        $clidx++;
265      }
266    # D. If removing or replacing metadata:
267    if (defined($old_value) && $old_value =~ /[\w\d]+/)
268      {
269        print STDERR "* removing '$key'='$old_value' from info database for document $oid\n" unless !$debug;
270        # i/   Call ??? to remove key-value from Lucene index
271        #      Moved elsewhere
272        # ii/  Use removeMetadata() to clear value in IncrementalDocument
273        $doc_obj->removeMetadata($key, $old_value);
274        # iii/ Call removeDocument() in ClassifyTreeModel(s) as necessary
275        foreach my $classifier_tree_model (@classifier_tree_models)
276          {
277            print STDERR "* removing '$old_value' from classifier tree\n" unless !$debug;
278            $classifier_tree_model->removeDocument($old_value, $oid, 1);
279          }
280      }
281    # E. If adding or replacing metadata:
282    if (defined($new_value) && $new_value =~ /[\w\d]+/)
283      {
284        print STDERR "* adding '$key'='$new_value' to info database for document $oid\n" unless !$debug;
285        # i/   Call ??? to add key-value from Lucene index
286        #      Moved elsewhere
287        # ii/ Use addMetadata() to add value in IncrementalDocument
288        $doc_obj->addMetadata($key, $new_value);
289        # iii/ Call addDocument() in ClassifyTreeModel(s) as necessary
290        foreach my $classifier_tree_model (@classifier_tree_models)
291          {
292            print STDERR "* adding '$new_value' to classifier tree\n" unless !$debug;
293            $classifier_tree_model->addDocument($new_value, $oid);
294          }
295      }
296    # F. Complete Lucene transaction
297    if(defined($updateindex) && $updateindex)
298      {
299        print STDERR "* updating Lucene indexes\n" unless !$debug;
300        &callGS2LuceneEditor($collection, $doc_obj->getDocNum, $key, $old_value, $new_value);
301      }
302    # G. Save IncrementalDocument to info database
303    $doc_obj->saveDocument();
304    $doc_obj = 0;
305  }
306# /** setDocumentMetadata() **/
307
308# /**
309#  *
310#  */
311sub callGS2LuceneDelete()
312  {
313    my ($collection, $docnum) = @_;
314
315    # Some path information that is the same for all indexes
316    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
317    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneDelete";
318    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
319    # Determine what indexes need to be changed by opening the collections
320    # index path and searching for directories named *idx
321    # If the directory doesn't exist, then there is no built index, and nothing
322    # for us to do.
323    if(opendir(INDEXDIR, $indexpath))
324      {
325        my @index_files = readdir(INDEXDIR);
326        closedir(INDEXDIR);
327        # For each index that matches or pattern, we call the java application
328        # to change the index (as necessary - not every index will include the
329        # document we have been asked to modify)
330        foreach my $actual_index_dir (@index_files)
331          {
332            next unless $actual_index_dir =~ /idx$/;
333            # Determine the path to the index to modify
334            my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
335            # Call java to remove the document
336            my $cmd = "java -classpath \"$classpath\" $java_lucene --index $full_index_dir --nodeid $docnum";
337            print STDERR "CMD: " . $cmd . "\n" unless !$debug;
338            # Run command
339            my $result = `$cmd 2>&1`;
340            print STDERR $result unless !$debug;
341          }
342      }
343    # Done
344  }
345# /** callGS2LuceneDelete() **/
346
347# /**
348#  */
349sub callGS2LuceneEditor()
350  {
351    my ($collection, $docnum, $key, $old_value, $new_value) = @_;
352
353    # Some path information that is the same for all indexes
354    my $classpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"java","classes");
355    my $jarpath = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrap.jar");
356    my $java_lucene = "org.nzdl.gsdl.LuceneWrap.GS2LuceneEditor";
357    my $indexpath = &util::filename_cat($ENV{'GSDLHOME'},"collect",$collection,"index");
358    # And some commands that don't change
359    my $java_args = "";
360    # Append the node id
361    $java_args .= "--nodeid $docnum ";
362    # We have to convert the given metadata key into its two letter field code.
363    # We do this by looking in the build.cfg file.
364    my $field = &getFieldFromBuildCFG($indexpath, $key);
365    # The metadata field to change
366    $java_args .= "--field $field ";
367    # And the old and new values as necessary
368    if(defined($old_value) && $old_value =~ /[\w\d]+/)
369      {
370        $java_args .= "--oldvalue \"$old_value\" ";
371      }
372    if(defined($new_value) && $new_value =~ /[\w\d]+/)
373      {
374        $java_args .= "--newvalue \"$new_value\" ";
375      }
376    # Determine what indexes need to be changed by opening the collections
377    # index path and searching for directories named *idx
378    # If the directory doesn't exist, then there is no built index, and nothing
379    # for us to do.
380    # We also check if the field is something other than "". It is entirely
381    # possible that we have been asked to update a metadata field that isn't
382    # part of any index, so this is where we break out of editing the index if
383    # we have
384    if($field =~ /^\w\w$/ && opendir(INDEXDIR, $indexpath))
385      {
386        my @index_files = readdir(INDEXDIR);
387        closedir(INDEXDIR);
388        # For each index that matches or pattern, we call the java application
389        # to change the index (as necessary - not every index will include the
390        # document we have been asked to modify)
391        foreach my $actual_index_dir (@index_files)
392          {
393            next unless $actual_index_dir =~ /idx$/;
394            # Determine the path to the index to modify
395            my $full_index_dir = &util::filename_cat($indexpath, $actual_index_dir);
396            # And prepend to the command java arguments
397            my $cur_java_args = "--index $full_index_dir " . $java_args;
398            print STDERR "CMD: java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1\n" unless !$debug;
399            # Run command
400            my $result = `java -classpath \"$classpath:$jarpath\" $java_lucene $cur_java_args 2>&1`;
401            print STDERR $result unless !$debug;
402          }
403      }
404    # Done
405  }
406# /** callGS2LuceneEditor() **/
407
408## Remove a document from the info database and Index.
409#
410#  @param  collection  The collection to alter
411#  @param  oid         The unique identifier of the document to be removed
412##
413sub deleteDocument()
414  {
415    my ($collection, $infodbtype, $oid) = @_;
416    # Load the incremental document to go with this oid, as we need some
417    # information from it.
418    my $doc_obj = new IncrementalDocument($collection, $infodbtype, $oid);
419    $doc_obj->loadDocument();
420    # Check if this object even exists by retrieving the docnum.
421    my $doc_num = $doc_obj->getDocNum();
422    print STDERR "Removing document docnum: $doc_num\n" unless !$debug;
423    if ($doc_num > -1)
424      {
425        # Now write a blank string to this oid in the info database
426    my $index_text_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text");
427    my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collection, $index_text_directory_path);
428    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $infodb_file_path, "append");
429    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $oid, &dbutil::convert_infodb_string_to_hash(""));
430        # Remove reverse lookup
431    &dbutil::write_infodb_entry($infodbtype, $infodb_file_handle, $doc_num, &dbutil::convert_infodb_string_to_hash(""));
432    &dbutil::close_infodb_write_handle($infodbtype, $infodb_file_handle);
433
434        # And remove from the database
435        &callGS2LuceneDelete($collection, $doc_num);
436
437        # Regenerate the classifier trees.
438        print STDERR "* load collection configuration\n";# unless !$debug;
439        my $config_obj = &getConfigObj($collection);
440        my $clidx = 1;
441        my %classifier_tree_models = ();
442        foreach my $classifier (@{$config_obj->{'classify'}})
443          {
444            my $index = 0;
445            my $option_count = scalar(@{$classifier});
446            for ($index = 0; $index < $option_count; $index++)
447              {
448                if ($index + 1 < $option_count && @{$classifier}[$index] eq "-metadata")
449                  {
450                    my $key = @{$classifier}[$index + 1];
451                    # Create a tree model for this classifier
452                    print STDERR "* creating a tree model for classifier: CL" . $clidx . " [" . $key . "]\n";# unless !$debug;
453                    my $tree_model_obj = new ClassifyTreeModel($collection, $infodbtype, "CL" . $clidx);
454                    # And store it against its key for later
455                    $classifier_tree_models{$key} = $tree_model_obj;
456                  }
457              }
458            $clidx++;
459          }
460       
461        # For each piece of metadata assigned to this document, if there is a
462        # matching classifier tree, remove the path from the tree.
463        print STDERR "* searching for classifier paths to be removed\n";
464       
465        my $metadata = $doc_obj->getAllMetadata();
466        foreach my $pair (@$metadata)
467          {
468            my ($key, $value) = @$pair;
469            print STDERR "* testing " . $key . "=>" . $value . "\n";
470            if (defined($classifier_tree_models{$key}))
471              {
472                my $model = $classifier_tree_models{$key};
473                print STDERR "* removing '" . $value . "' from classifier " . $model->getRootNode()->getCLID() . "\n";
474                $model->removeDocument($value, $oid, 1);
475              }
476          }
477
478        # We also have to remove from browselist - the reverse process of
479        # adding to browselist shown above.
480        my $dummy_model = new ClassifyTreeModel($collection, $infodbtype, "");
481        my $browselist_node = new ClassifyTreeNode($dummy_model, "browselist");
482        # Add the document
483        $browselist_node->removeDocument($oid);
484        # Clean up
485      }
486    # else, no document, no need to delete.
487  }
488## deleteDocument() ##
489
490# /**
491#  */
492sub getFieldFromBuildCFG()
493  {
494    my ($indexpath, $key) = @_;
495    my $field = "";
496    my $build_cfg = &util::filename_cat($indexpath, "build.cfg");
497    # If there isn't a build.cfg then the index hasn't been built and there is
498    # nothing to do
499    if(open(BUILDCFG, $build_cfg))
500      {
501        # For each line of the build configuration
502        my $line;
503        while($line = <BUILDCFG>)
504          {
505            # Only interested in the indexfieldmap line
506            if($line =~ /^indexfieldmap\s+/)
507              {
508                # Extract the field information by looking up the key pair
509                if($line =~ /\s$key->(\w\w)/)
510                  {
511                    $field = $1;
512                  }
513              }
514          }
515        # Done with file
516        close(BUILDCFG);
517      }
518    # Return whatever we found
519    return $field;
520  }
521# /** getFieldFromBuildCFG() **/
522
523
524
525
526
527# /** Retrieve an object (associative array) containing information about the
528#  *  collection configuration.
529#  *  @param  $collection The shortname of the collection as a string
530#  *  @return An associative array containing information from the collect.cfg
531#  *  @author John Thompson, DL Consulting Ltd.
532#  */
533sub getConfigObj()
534  {
535    my ($collection) = @_;
536
537    #rint STDERR "getConfigObj()\n" unless !$debug;
538
539    my $colcfgname = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "etc", "collect.cfg");
540    if (!-e $colcfgname)
541      {
542        die "IncrementalBuildUtils - couldn't find collect.cfg for collection $collection\n";
543      }
544    return &colcfg::read_collect_cfg ($colcfgname);
545  }
546# /** getConfigObj() **/
547
5481;
Note: See TracBrowser for help on using the browser.