source: main/trunk/greenstone2/perllib/ClassifyTreeNode.pm@ 21561

Last change on this file since 21561 was 21561, checked in by mdewsnip, 14 years ago

Changed calls to GDBMUtils::gdbmCachedCollectionGet() to dbutil::read_infodb_entry(). Part of removing GDBMUtils.pm and making the code less GDBM-specific.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.6 KB
Line 
1package ClassifyTreeNode;
2
3use ClassifyTreeNode;
4use ClassifyTreePath;
5use GDBMUtils;
6use strict;
7
8# /** Constructor
9# *
10# *
11# * @author John Thompson, DL Consulting Ltd.
12# */
13sub new()
14 {
15 my ($class, $model, $clid, $force_new) = @_;
16 my $debug = 0;
17 $force_new = 0 unless defined($force_new);
18 print STDERR "ClassifyTreeNode.new(model, \"$clid\", $force_new)\n" unless !$debug;
19 $force_new = 0 unless defined($force_new);
20 # Test the parameters
21 die("Can't create a tree node that doesn't belong to a tree model!") unless $model;
22 die("Can't create a tree node that doesn't have a unique id (OID)!") unless $clid;
23 # Store the variables
24 my $self = {};
25 $self->{'debug'} = $debug;
26 $self->{'model'} = $model;
27 $self->{'clid'} = $clid;
28
29 my $collection = $model->getCollection();
30
31 my $index_text_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text");
32 $self->{'infodb_file_path'} = &dbutil::get_infodb_file_path("gdbm", $collection, $index_text_directory_path);
33
34 # Check if this node already exists in the database, and if not insert it
35 # now
36 my $text = &dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $clid);
37 if($text !~ /\w+/ && $force_new)
38 {
39 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid,
40 "<doctype>classify\n<hastxt>0\n<childtype>VList\n<Title>\n<numleafdocs>0\n<contains>\n");
41 }
42 # Bless me father for I have sinned
43 bless $self, $class;
44 return $self;
45 }
46# /** new() **/
47
48# /** Add a document to this tree node.
49# *
50# * @param $oid The unique identifier of the document to add
51# *
52# * @author John Thompson, DL Consulting Ltd
53# */
54sub addDocument()
55 {
56 my ($self, $oid) = @_;
57 print STDERR "ClassifyTreeNode.addDocument(\"$oid\")\n" unless !$self->{'debug'};
58 # Get the current contains list
59 my $contains = $self->getContains();
60 # See whether this document already exists in the contains
61 if ($contains !~ /(^$oid$|^$oid;$|;$oid;|;$oid$)/)
62 {
63 # If not, append to the contains list
64 if ($contains ne "")
65 {
66 $contains .= ";$oid";
67 }
68 else
69 {
70 $contains = $oid;
71 }
72 # Store the changed contains
73 $self->setContains($contains);
74 # We now have to update the numleafdocs count for this node and its
75 # ancestor nodes
76 my $cur_node_obj = $self;
77 while ($cur_node_obj)
78 {
79 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
80 if ($numleafdocs =~ /^\d+$/)
81 {
82 $numleafdocs ++;
83 }
84 else
85 {
86 $numleafdocs = 1;
87 }
88 $cur_node_obj->setNumLeafDocs($numleafdocs);
89 $cur_node_obj = $cur_node_obj->getParentNode();
90 }
91 }
92 else
93 {
94 print STDERR "Document already exists!\n" unless !$self->{'debug'};
95 }
96 }
97# /** addDocument() **/
98
99# /** Changes the CLID of a particular node. Note that this is significantly
100# * tricky step, as we have to remove the old node from the database, and
101# * then readd with the corrected CLID.
102# *
103# * @param $clid The new CLID as an integer
104# *
105# * @author John Thompson, DL Consulting Ltd.
106# */
107sub changeCLID()
108 {
109 my($self, $new_clid) = @_;
110 print STDERR "ClassifyTreeNode.changeCLID(\"$new_clid\")\n" unless !$self->{'debug'};
111 # Store the current clid for later use
112 my $old_clid = $self->{'clid'};
113 # And record the children now, as they'll change after we shift the parent
114 # CLID
115 my @child_nodes = $self->getChildren();
116
117 # Retrieve the current document
118 my $text = $self->toString();
119
120 my $collection = $self->{'model'}->getCollection();
121
122 # Create a new document with the correct CLID
123 &GDBMUtils::gdbmCachedCollectionSet($collection, $new_clid, $text);
124
125 # Remove the old document
126 &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
127
128 # Finally, change the clid stored in this document
129 $self->{'clid'} = $new_clid;
130
131 # Now go through this nodes children, and shift them too
132 foreach my $child_node (@child_nodes)
133 {
134 # We determine the new clid by retrieving the childs current clid,
135 # and then replacing any occurance to the parents old clid with the
136 # parents new clid
137 my $old_child_clid = $child_node->getCLID();
138 #rint STDERR "* considering: " . $old_child_clid . "\n";
139 if($old_child_clid =~ /^CL/)
140 {
141 my $new_child_clid = $new_clid . substr($old_child_clid, length($old_clid));
142 #rint STDERR "* shifting child $old_child_clid to $new_child_clid\n";
143 $child_node->changeCLID($new_child_clid);
144 }
145 }
146 }
147# /** changeCLID() **/
148
149
150# /** Retrieve the unique id for this classifier.
151# *
152# * @return The CLID as a string
153# *
154# * @author John Thompson, DL Consulting Ltd.
155# */
156sub getCLID()
157 {
158 my ($self) = @_;
159 print STDERR "ClassifyTreeNode.getCLID()\n" unless !$self->{'debug'};
160 return $self->{'clid'};
161 }
162
163# /** Return the child objects of this node an as array.
164# *
165# * @return An array of node objects
166# *
167# * @author John Thompson, DL Consulting Ltd.
168# */
169sub getChildren()
170 {
171 my ($self) = @_;
172 print STDERR "ClassifyTreeNode.getChildren()\n" unless !$self->{'debug'};
173 my $text = $self->toString();
174 my @children = ();
175 # Retrieve the contains metadata item
176 if($text =~ /<contains>(.*?)\r?\n/)
177 {
178 #rint STDERR "* children formed from contains: $1\n";
179 my $contains_raw = $1;
180 my @contains = split(/;/, $contains_raw);
181 foreach my $child_clid (@contains)
182 {
183 # Replace the " with the parent clid
184 $child_clid =~ s/\"/$self->{'clid'}/;
185 # Create the node obj
186 my $child_node_obj = new ClassifyTreeNode($self->{'model'}, $child_clid);
187 # And insert into ever growing array of child nodes
188 push(@children, $child_node_obj);
189 }
190 }
191 return @children;
192 }
193# /** getChildren() **/
194
195# /** Retrieve the contains metadata which is used to determine this nodes
196# * children.
197# *
198# * @return The contains metadata as a string
199# *
200# * @author John Thompson, DL Consulting Ltd.
201# */
202sub getContains()
203 {
204 my ($self) = @_;
205 print STDERR "ClassifyTreeNode.getContains()\n" unless !$self->{'debug'};
206 my $result = 0;
207 my $text = $self->toString();
208 if($text =~ /<contains>(.*?)\r?\n/)
209 {
210 $result = $1;
211 # Replace " with this nodes CLID
212 $result =~ s/\"/$self->{'clid'}/g;
213 }
214 return $result;
215 }
216# /** getContains() **/
217
218# /** Retrieve this nodes next sibling.
219# *
220# * @return The next sibling node object or 0 if no such node
221# *
222# * @author John Thompson, DL Consulting Ltd.
223# */
224sub getNextSibling()
225 {
226 my ($self) = @_;
227 print STDERR "ClassifyTreeNode.getNextSibling()\n" unless !$self->{'debug'};
228 my $sibling_node = 0;
229 # The next sibling would be the node identified by the CLID with its
230 # suffix number one greater than this nodes CLID.
231 my @clid_parts = split(/\./, $self->{'clid'});
232 my $suffix = pop(@clid_parts);
233 $suffix++;
234 push(@clid_parts, $suffix);
235 my $next_clid = join(".", @clid_parts);
236
237 my $collection = $self->{'model'}->getCollection();
238
239 # Now determine if this node exists.
240 if (&dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $next_clid) =~ /\w+/)
241 {
242 # And if so, create it.
243 $sibling_node = new ClassifyTreeNode($self->{'model'}, $next_clid);
244 }
245 # Done
246 return $sibling_node;
247 }
248# /** getNextSibling() **/
249
250# /** Retrieve the numleafdocs metadata which if affected by any changes to
251# * child nodes.
252# *
253# * @return The numleafdocs as an integer
254# *
255# * @author John Thompson, DL Consulting Ltd.
256# */
257sub getNumLeafDocs()
258 {
259 my ($self) = @_;
260 print STDERR "ClassifyTreeNode.getNumLeafDocs()\n" unless !$self->{'debug'};
261 my $result = 0;
262 my $text = $self->toString();
263 if($text =~ /<numleafdocs>(\d*?)\r?\n/)
264 {
265 $result = $1;
266 }
267 return $result;
268 }
269# /** getNumLeafDocs() **/
270
271# /** Retrieve the parent node of the given node.
272# *
273# * @param $child_node The node whose parent we want to retrieve
274# * @return The parent node, or 0 if this is the root
275# *
276# * @author John Thompson, DL Consulting Ltd.
277# */
278sub getParentNode()
279 {
280 my ($self) = @_;
281 print STDERR "ClassifyTreeNode.getParentNode()\n" unless !$self->{'debug'};
282 my $parent_node = 0;
283 my $child_clid = $self->getCLID();
284 my @clid_parts = split(/\./, $child_clid);
285 if(scalar(@clid_parts) > 1)
286 {
287 pop(@clid_parts);
288 my $parent_clid = join(".", @clid_parts);
289 $parent_node = $self->{'model'}->getNodeByCLID($parent_clid);
290 }
291 # Otherwise we are already at the root node
292 return $parent_node;
293 }
294# /** getParentNode() **/
295
296# /** Retrieve the path to this node.
297# *
298# * @return The path obj which represents the path to this node or 0 if no
299# * path information exists
300# *
301# * @author John Thompson, DL Consulting Ltd.
302# */
303sub getPath()
304 {
305 my ($self) = @_;
306 print STDERR "ClassifyTreeNode.getPath()\n" unless !$self->{'debug'};
307 my $result = 0;
308 my $text = $self->toString();
309 if($text =~ /<Title>(.*?)\r?\n/ )
310 {
311 my $this_component = $1;
312 # If this node has a parent, then retrieve its path
313 my $parent_node = $self->getParentNode();
314 if ($parent_node)
315 {
316 # Get the path...
317 $result = $parent_node->getPath();
318 # ... and add our component
319 $result->addPathComponent($this_component);
320 }
321 else
322 {
323 $result = new ClassifyTreePath($this_component);
324 }
325 }
326 return $result;
327 }
328# /** getPath() **/
329
330# /** Retrieve the title of this node. This returns essentially the same
331# * information as getPath, but without the encapsulating object.
332# *
333# * @return The title as a string
334# *
335# * @author John Thompson, DL Consulting Ltd.
336# */
337sub getTitle()
338 {
339 my ($self) = @_;
340 print STDERR "ClassifyTreeNode.getTitle()\n" unless !$self->{'debug'};
341 my $result = 0;
342 my $text = $self->toString();
343 if($text =~ /<Title>(.*?)\r?\n/)
344 {
345 $result = $1;
346 }
347 return $result;
348 }
349# /** getTitle() **/
350
351# /** Using the given value locate the correct position to insert a new node,
352# * create it, and then establish it in the database.
353# *
354# * @param $path The path used to determine where to insert node as a string
355# * @return The newly inserted node object
356# *
357# * @author John Thompson, DL Consulting Ltd.
358# */
359sub insertNode()
360 {
361 my ($self, $path) = @_;
362 print STDERR "ClassifyTreeNode.insertNode(\"$path\")\n" unless !$self->{'debug'};
363 my $child_clid = "";
364 my $child_node = 0;
365 my $new_contains = "";
366 # Get the children of this node
367 my @children = $self->getChildren();
368 # If there are no current children, then this will be the first
369 if (scalar(@children) == 0)
370 {
371 #rint STDERR "ClassifyTreeNode.insertNode: first child!\n";
372 $child_clid = $self->{'clid'} . ".1"; # First child
373 # Contains needs to have this new clid added
374 $new_contains = $child_clid;
375 }
376 # Otherwise search through the current children, looking at their values
377 # to locate where to insert this node.
378 else
379 {
380 #rint STDERR "ClassifyTreeNode.insertNode: searching for position...\n";
381 my $found = 0;
382 my $offset = 1;
383 foreach my $sibling_node (@children)
384 {
385 my $sibling_path = $sibling_node->getPath();
386 # If we are still searching for the insertion point
387 if(!$found)
388 {
389 if($sibling_path->toString() eq $path->toString())
390 {
391 # What?!? This node already exists! why are we adding it again!
392 print STDERR "ClassifyTreeNode.insertNode: what?!? node already exists... how did we get here?\n";
393 return $sibling_node;
394 }
395 elsif($sibling_path->toString() gt $path->toString())
396 {
397 # Found our location!
398 $found = 1;
399 $child_clid = $self->{'clid'} . "." . $offset;
400 # You may notice we haven't added this node to contains.
401 # This is because the parent node already contains this
402 # clid - instead we need to record the new highest clid
403 # created when we move the sibling nodes for here onwards
404 # up one space.
405 #rint STDERR "ClassifyTreeNode.insertNode: found our location: $child_clid \n";
406 last;
407 }
408 }
409 $offset++;
410 }
411 # If we haven't found the node, we insert at the end.
412 if(!$found)
413 {
414 #rint STDERR "ClassifyTreeNode.insertNode not found... insert at end \n";
415 $child_clid = $self->{'clid'} . "." . $offset;
416 # Contains needs to have this new clid added
417 $new_contains = $child_clid;
418 }
419 # If we did find the node, we now have to go through the sibling nodes
420 # shifting them up one CLID to ensure there's space.
421 else
422 {
423 # We need another copy of children, but this time with the last
424 # children first!
425 @children = reverse $self->getChildren();
426 my $offset2 = scalar(@children) + 1;
427 foreach my $sibling_node (@children)
428 {
429 $sibling_node->changeCLID($self->{'clid'} . "." . $offset2);
430 # If this if the highest sibling node we are going to rename,
431 # then use it to set the contains metadata.
432 if($new_contains !~ /\w+/)
433 {
434 $new_contains = $self->{'clid'} . "." . $offset2;
435 }
436 # Once we've processed the node exactly in the space the new
437 # node will occupy, we're done.
438 $offset2--;
439 if($offset2 == $offset)
440 {
441 last;
442 }
443 }
444 }
445 }
446 $child_node = new ClassifyTreeNode($self->{'model'}, $child_clid, 1);
447 # Set the value, as this is the only piece of metadata we know and care
448 # about at this stage
449 $child_node->setTitle($path->getLastPathComponent());
450 # Update the contains metadata for this node
451 my $contains = $self->getContains();
452 if($contains =~ /\w/)
453 {
454 $contains .= ";" . $new_contains;
455 }
456 else
457 {
458 $contains = $new_contains;
459 }
460 $self->setContains($contains);
461 # And return the node
462 return $child_node;
463 }
464# /** insertNode() **/
465
466# /** Remove all the children of this node and return the number of document
467# * references (leaf nodes) removed by this process.
468# *
469# * @return The count of document references removed as an integer
470# *
471# * @author John Thompson, DL Consulting Ltd
472# */
473sub removeAllNodes()
474 {
475 my ($self) = @_;
476 print STDERR "ClassifyTreeNode.removeAllNodes()\n" unless !$self->{'debug'};
477 my $num_leaf_docs = 0;
478 # Recursively remove this nodes children
479 my @children = $self->getChildren();
480 foreach my $child_node (@children)
481 {
482 $child_node->removeAllNodes();
483 }
484 # Retrieve the document count (leaf docs)
485 my $text = $self->toString();
486 if ($text =~ /<numleafdocs>(\d+)/)
487 {
488 $num_leaf_docs += $1;
489 }
490 # Now remove the node from the database. We do this calling set gdbm with
491 # no value argument.
492 my $collection = $self->{'model'}->getCollection();
493 &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
494
495 # Return the leaf count (so we can adjust the numleafdocs at the root node
496 # of this deletion.
497 return $num_leaf_docs;
498 }
499# /** removeAllNodes() **/
500
501# /** Remove the given document this node, and then update the numleafdocs
502# * metadata for all the ancestor nodes.
503# *
504# * @param $oid The unique identifier of a greenstone document
505# *
506# * @author John Thompson, DL Consulting Ltd.
507# */
508sub removeDocument()
509 {
510 my ($self, $oid) = @_;
511 print STDERR "ClassifyTreeNode::removeDocument(\"$oid\")\n" unless !$self->{'debug'};
512 # Retrieve the contains metadata
513 my $contains = $self->getContains();
514 # Remove this oid
515 my @contains_parts = split(/;/, $contains);
516 my @new_contains_parts = ();
517 foreach my $oid_or_clid (@contains_parts)
518 {
519 if ($oid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
520 {
521 push(@new_contains_parts, $oid_or_clid);
522 }
523 }
524 $contains = join(";", @new_contains_parts);
525 $self->setContains($contains);
526 # We now have to update the numleafdocs count for this node and its
527 # ancestor nodes
528 my $cur_node_obj = $self;
529 while ($cur_node_obj)
530 {
531 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
532 if ($numleafdocs =~ /^\d+$/)
533 {
534 $numleafdocs--;
535 }
536 else
537 {
538 $numleafdocs = 0;
539 }
540 $cur_node_obj->setNumLeafDocs($numleafdocs);
541 $cur_node_obj = $cur_node_obj->getParentNode();
542 }
543 # Done
544 }
545# /** removeDocument() **/
546
547# /** Remove the node denoted by the path.
548# *
549# * @param $child_node The node to be removed
550# *
551# * @author John Thompson, DL Consulting Ltd
552# */
553sub removeNode()
554 {
555 my ($self, $child_node) = @_;
556 # Not as easy as it first sounds as we have to do a recursive remove,
557 # keeping track of any documents removed so we can update document count.
558 # We then remove this node, adjusting the sibling's clid's as necessary
559 # before altering the contains.
560 print STDERR "ClassifyTreeNode::removeNode(child_node)\n" unless !$self->{'debug'};
561 my $remove_clid = $child_node->getCLID();
562 my $sibling_node = $child_node->getNextSibling();
563 # Recursively remove this nodes and its children, taking note of decrease
564 # in document count.
565 my $removed_numleafdocs = $child_node->removeAllNodes();
566 # Determine if removing this node requires other nodes to be moved, and if
567 # so, do so. We do this in a repeating loop until there are no further
568 # siblings, overwriting the $remove_clid variable with the clid of the node
569 # just changed (you'll see why in a moment).
570 while ($sibling_node != 0)
571 {
572 my $current_node = $sibling_node;
573 # Get this nodes sibling
574 $sibling_node = $current_node->getNextSibling();
575 # Record the CLID to change to
576 my $new_clid = $remove_clid;
577 # Record the old clid
578 $remove_clid = $current_node->getCLID();
579 # Modify the clid of the current node
580 $current_node->changeCLID($new_clid);
581 # Continue until there are no further sibling nodes
582 }
583 # By now the $remove_clid will contain the CLID that has to be removed from
584 # the contains metadata for this node
585 my $contains = $self->getContains();
586 my @contains_parts = split(/;/, $contains);
587 my @new_contains_parts = ();
588 foreach my $oid_or_clid (@contains_parts)
589 {
590 if ($remove_clid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
591 {
592 push(@new_contains_parts, $oid_or_clid);
593 }
594 }
595 $contains = join(";", @new_contains_parts);
596 $self->setContains($contains);
597 # We also alter the numleafdocs metadata to reflect the removal of these
598 # nodes.
599 my $numleafdocs = $self->getNumLeafDocs();
600 if ($numleafdocs =~ /^\d+$/)
601 {
602 $numleafdocs -= $removed_numleafdocs;
603 }
604 else
605 {
606 $numleafdocs = 0;
607 }
608 $self->setNumLeafDocs($numleafdocs);
609 # Done
610 }
611# /** removeNode() **/
612
613# /** Set the contains metadata in the database.
614# *
615# * @param $contains The new contains string
616# *
617# * @author John Thompson, DL Consulting Ltd.
618# */
619sub setContains()
620 {
621 my ($self, $contains) = @_;
622 print STDERR "ClassifyTreeNode::setContains(\"$contains\")\n" unless !$self->{'debug'};
623 # Replace any occurance of this nodes CLID with "
624 $contains =~ s/$self->{'clid'}/\"/g;
625
626 my $collection = $self->{'model'}->getCollection();
627 my $clid = $self->{'clid'};
628
629 # Load the text of this node
630 my $text = &dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $clid);
631
632 # Replace the contains
633 #rint STDERR "Before: $text\n";
634 $text =~ s/<contains>.*?\n+/<contains>$contains\n/;
635 #rint STDERR "After: $text\n";
636 # Store the changed text
637 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
638 }
639# /** setContains() **/
640
641# /** Set the numleafdocs metadata in the database.
642# *
643# * @param $numleafdocs The new count of leaf documents
644# *
645# * @author John Thompson, DL Consulting Ltd.
646# */
647sub setNumLeafDocs()
648 {
649 my ($self, $numleafdocs) = @_;
650 print STDERR "ClassifyTreeNode::setNumLeafDocs(numleafdocs)\n" unless !$self->{'debug'};
651
652 my $collection = $self->{'model'}->getCollection();
653 my $clid = $self->{'clid'};
654
655 # Load the text of this node
656 my $text = &dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $clid);
657 # Replace the numleafdocs
658 $text =~ s/<numleafdocs>\d*?\n+/<numleafdocs>$numleafdocs\n/;
659 # Store the changed text
660 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
661 }
662# /** setNumLeafDocs() **/
663
664# /** Set the title metadata in the database.
665# * Note: Previously this was value and we extracted the title, but the new
666# * autohierarchies don't set values.
667# *
668# * @param $title The new title string
669# *
670# * @author John Thompson, DL Consulting Ltd.
671# */
672sub setTitle()
673 {
674 my ($self, $title) = @_;
675 print STDERR "ClassifyTreeNode::setTitle(\"$title\")\n" unless !$self->{'debug'};
676
677 my $collection = $self->{'model'}->getCollection();
678 my $clid = $self->{'clid'};
679
680 # Load the text of this node
681 my $text = &dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $clid);
682 # Replace the title
683 $text =~ s/<Title>.*?\n+/<Title>$title\n/;
684 # Store the changed text
685 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
686 }
687# /** setValue() **/
688
689# /** Represent this node as a string.
690# *
691# * @return The string representation of this node
692# *
693# * @author John Thompson, DL Consulting Ltd.
694# */
695sub toString()
696 {
697 my ($self) = @_;
698 print STDERR "ClassifyTreeNode::toString()\n" unless !$self->{'debug'};
699 my $collection = $self->{'model'}->getCollection();
700 my $clid = $self->{'clid'};
701
702 my $text = &dbutil::read_infodb_entry("gdbm", $self->{'infodb_file_path'}, $clid);
703 return $text;
704 }
705# /** toString() **/
706
7071;
Note: See TracBrowser for help on using the repository browser.