source: gsdl/trunk/perllib/ClassifyTreeNode.pm@ 14374

Last change on this file since 14374 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.8 KB
Line 
1package ClassifyTreeNode;
2
3use ClassifyTreeNode;
4use ClassifyTreePath;
5use GDBMUtils;
6
7# /** Constructor
8# *
9# *
10# * @author John Thompson, DL Consulting Ltd.
11# */
12sub new()
13 {
14 my ($class, $model, $clid, $force_new) = @_;
15 my $debug = 0;
16 $force_new = 0 unless defined($force_new);
17 print STDERR "ClassifyTreeNode.new(model, \"$clid\", $force_new)\n" unless !$debug;
18 $force_new = 0 unless defined($force_new);
19 # Test the parameters
20 die("Can't create a tree node that doesn't belong to a tree model!") unless $model;
21 die("Can't create a tree node that doesn't have a unique id (OID)!") unless $clid;
22 # Store the variables
23 my $self = {};
24 $self->{'debug'} = $debug;
25 $self->{'model'} = $model;
26 $self->{'clid'} = $clid;
27 # Check if this node already exists in the database, and if not insert it
28 # now
29 my $text = &GDBMUtils::gdbmGet($model->getCollection(), $clid);
30 if($text !~ /\w+/ && $force_new)
31 {
32 &GDBMUtils::gdbmSet($model->getCollection(), $clid,
33 "<doctype>classify\n<hastxt>0\n<childtype>VList\n<Title>\n<numleafdocs>0\n<contains>\n");
34 }
35 # Bless me father for I have sinned
36 bless $self, $class;
37 return $self;
38 }
39# /** new() **/
40
41# /** Add a document to this tree node.
42# *
43# * @param $oid The unique identifier of the document to add
44# *
45# * @author John Thompson, DL Consulting Ltd
46# */
47sub addDocument()
48 {
49 my ($self, $oid) = @_;
50 print STDERR "ClassifyTreeNode.addDocument(\"$oid\")\n" unless !$self->{'debug'};
51 # Get the current contains list
52 my $contains = $self->getContains();
53 # See whether this document already exists in the contains
54 if ($contains !~ /(^$oid$|^$oid;$|;$oid;|;$oid$)/)
55 {
56 # If not, append to the contains list
57 if ($contains ne "")
58 {
59 $contains .= ";$oid";
60 }
61 else
62 {
63 $contains = $oid;
64 }
65 # Store the changed contains
66 $self->setContains($contains);
67 # We now have to update the numleafdocs count for this node and its
68 # ancestor nodes
69 my $cur_node_obj = $self;
70 while ($cur_node_obj)
71 {
72 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
73 if ($numleafdocs =~ /^\d+$/)
74 {
75 $numleafdocs ++;
76 }
77 else
78 {
79 $numleafdocs = 1;
80 }
81 $cur_node_obj->setNumLeafDocs($numleafdocs);
82 $cur_node_obj = $cur_node_obj->getParentNode();
83 }
84 }
85 else
86 {
87 print STDERR "Document already exists!\n" unless !$self->{'debug'};
88 }
89 }
90# /** addDocument() **/
91
92# /** Changes the CLID of a particular node. Note that this is significantly
93# * tricky step, as we have to remove the old node from the database, and
94# * then readd with the corrected CLID.
95# *
96# * @param $clid The new CLID as an integer
97# *
98# * @author John Thompson, DL Consulting Ltd.
99# */
100sub changeCLID()
101 {
102 my($self, $new_clid) = @_;
103 print STDERR "ClassifyTreeNode.changeCLID(\"$new_clid\")\n" unless !$self->{'debug'};
104 # Store the current clid for later use
105 my $old_clid = $self->{'clid'};
106 # And record the children now, as they'll change after we shift the parent
107 # CLID
108 my @child_nodes = $self->getChildren();
109 # Retrieve the current document
110 my $text = $self->toString();
111 # Create a new document with the correct CLID
112 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $new_clid, $text);
113 # Remove the old document
114 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'});
115 # Finally, change the clid stored in this document
116 $self->{'clid'} = $new_clid;
117 # Now go through this nodes children, and shift them too
118 foreach my $child_node (@child_nodes)
119 {
120 # We determine the new clid by retrieving the childs current clid,
121 # and then replacing any occurance to the parents old clid with the
122 # parents new clid
123 $old_child_clid = $child_node->getCLID();
124 #rint STDERR "* considering: " . $old_child_clid . "\n";
125 if($old_child_clid =~ /^CL/)
126 {
127 $new_child_clid = $new_clid . substr($old_child_clid, length($old_clid));
128 #rint STDERR "* shifting child $old_child_clid to $new_child_clid\n";
129 $child_node->changeCLID($new_child_clid);
130 }
131 }
132 }
133# /** changeCLID() **/
134
135
136# /** Retrieve the unique id for this classifier.
137# *
138# * @return The CLID as a string
139# *
140# * @author John Thompson, DL Consulting Ltd.
141# */
142sub getCLID()
143 {
144 my ($self) = @_;
145 print STDERR "ClassifyTreeNode.getCLID()\n" unless !$self->{'debug'};
146 return $self->{'clid'};
147 }
148
149# /** Return the child objects of this node an as array.
150# *
151# * @return An array of node objects
152# *
153# * @author John Thompson, DL Consulting Ltd.
154# */
155sub getChildren()
156 {
157 my ($self) = @_;
158 print STDERR "ClassifyTreeNode.getChildren()\n" unless !$self->{'debug'};
159 my $text = $self->toString();
160 my @children = ();
161 # Retrieve the contains metadata item
162 if($text =~ /<contains>(.*?)\r?\n/)
163 {
164 #rint STDERR "* children formed from contains: $1\n";
165 my $contains_raw = $1;
166 my @contains = split(/;/, $contains_raw);
167 foreach my $child_clid (@contains)
168 {
169 # Replace the " with the parent clid
170 $child_clid =~ s/\"/$self->{'clid'}/;
171 # Create the node obj
172 my $child_node_obj = new ClassifyTreeNode($self->{'model'}, $child_clid);
173 # And insert into ever growing array of child nodes
174 push(@children, $child_node_obj);
175 }
176 }
177 return @children;
178 }
179# /** getChildren() **/
180
181# /** Retrieve the contains metadata which is used to determine this nodes
182# * children.
183# *
184# * @return The contains metadata as a string
185# *
186# * @author John Thompson, DL Consulting Ltd.
187# */
188sub getContains()
189 {
190 my ($self) = @_;
191 print STDERR "ClassifyTreeNode.getContains()\n" unless !$self->{'debug'};
192 my $result = 0;
193 my $text = $self->toString();
194 if($text =~ /<contains>(.*?)\r?\n/)
195 {
196 $result = $1;
197 # Replace " with this nodes CLID
198 $result =~ s/\"/$self->{'clid'}/g;
199 }
200 return $result;
201 }
202# /** getContains() **/
203
204# /** Retrieve this nodes next sibling.
205# *
206# * @return The next sibling node object or 0 if no such node
207# *
208# * @author John Thompson, DL Consulting Ltd.
209# */
210sub getNextSibling()
211 {
212 my ($self) = @_;
213 print STDERR "ClassifyTreeNode.getNextSibling()\n" unless !$self->{'debug'};
214 my $sibling_node = 0;
215 # The next sibling would be the node identified by the CLID with its
216 # suffix number one greater than this nodes CLID.
217 my @clid_parts = split(/\./, $self->{'clid'});
218 my $suffix = pop(@clid_parts);
219 $suffix++;
220 push(@clid_parts, $suffix);
221 my $next_clid = join(".", @clid_parts);
222 # Now determine if this node exists.
223 if(&GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $next_clid) =~ /\w+/)
224 {
225 # And if so, create it.
226 $sibling_node = new ClassifyTreeNode($self->{'model'}, $next_clid);
227 }
228 # Done
229 return $sibling_node;
230 }
231# /** getNextSibling() **/
232
233# /** Retrieve the numleafdocs metadata which if affected by any changes to
234# * child nodes.
235# *
236# * @return The numleafdocs as an integer
237# *
238# * @author John Thompson, DL Consulting Ltd.
239# */
240sub getNumLeafDocs()
241 {
242 my ($self) = @_;
243 print STDERR "ClassifyTreeNode.getNumLeafDocs()\n" unless !$self->{'debug'};
244 my $result = 0;
245 my $text = $self->toString();
246 if($text =~ /<numleafdocs>(\d*?)\r?\n/)
247 {
248 $result = $1;
249 }
250 return $result;
251 }
252# /** getNumLeafDocs() **/
253
254# /** Retrieve the parent node of the given node.
255# *
256# * @param $child_node The node whose parent we want to retrieve
257# * @return The parent node, or 0 if this is the root
258# *
259# * @author John Thompson, DL Consulting Ltd.
260# */
261sub getParentNode()
262 {
263 my ($self) = @_;
264 print STDERR "ClassifyTreeNode.getParentNode()\n" unless !$self->{'debug'};
265 my $parent_node = 0;
266 my $child_clid = $self->getCLID();
267 my @clid_parts = split(/\./, $child_clid);
268 if(scalar(@clid_parts) > 1)
269 {
270 pop(@clid_parts);
271 my $parent_clid = join(".", @clid_parts);
272 $parent_node = $self->{'model'}->getNodeByCLID($parent_clid);
273 }
274 # Otherwise we are already at the root node
275 return $parent_node;
276 }
277# /** getParentNode() **/
278
279# /** Retrieve the path to this node.
280# *
281# * @return The path obj which represents the path to this node or 0 if no
282# * path information exists
283# *
284# * @author John Thompson, DL Consulting Ltd.
285# */
286sub getPath()
287 {
288 my ($self) = @_;
289 print STDERR "ClassifyTreeNode.getPath()\n" unless !$self->{'debug'};
290 my $result = 0;
291 my $text = $self->toString();
292 if($text =~ /<Title>(.*?)\r?\n/ )
293 {
294 my $this_component = $1;
295 # If this node has a parent, then retrieve its path
296 my $parent_node = $self->getParentNode();
297 if ($parent_node)
298 {
299 # Get the path...
300 $result = $parent_node->getPath();
301 # ... and add our component
302 $result->addPathComponent($this_component);
303 }
304 else
305 {
306 $result = new ClassifyTreePath($this_component);
307 }
308 }
309 return $result;
310 }
311# /** getPath() **/
312
313# /** Retrieve the title of this node. This returns essentially the same
314# * information as getPath, but without the encapsulating object.
315# *
316# * @return The title as a string
317# *
318# * @author John Thompson, DL Consulting Ltd.
319# */
320sub getTitle()
321 {
322 my ($self) = @_;
323 print STDERR "ClassifyTreeNode.getTitle()\n" unless !$self->{'debug'};
324 my $result = 0;
325 my $text = $self->toString();
326 if($text =~ /<Title>(.*?)\r?\n/)
327 {
328 $result = $1;
329 }
330 return $result;
331 }
332# /** getTitle() **/
333
334# /** Using the given value locate the correct position to insert a new node,
335# * create it, and then establish it in the database.
336# *
337# * @param $path The path used to determine where to insert node as a string
338# * @return The newly inserted node object
339# *
340# * @author John Thompson, DL Consulting Ltd.
341# */
342sub insertNode()
343 {
344 my ($self, $path) = @_;
345 print STDERR "ClassifyTreeNode.insertNode(\"$path\")\n" unless !$self->{'debug'};
346 my $child_clid = "";
347 my $child_node = 0;
348 my $new_contains = "";
349 # Get the children of this node
350 my @children = $self->getChildren();
351 # If there are no current children, then this will be the first
352 if (scalar(@children) == 0)
353 {
354 #rint STDERR "ClassifyTreeNode.insertNode: first child!\n";
355 $child_clid = $self->{'clid'} . ".1"; # First child
356 # Contains needs to have this new clid added
357 $new_contains = $child_clid;
358 }
359 # Otherwise search through the current children, looking at their values
360 # to locate where to insert this node.
361 else
362 {
363 #rint STDERR "ClassifyTreeNode.insertNode: searching for position...\n";
364 my $found = 0;
365 my $offset = 1;
366 foreach my $sibling_node (@children)
367 {
368 my $sibling_path = $sibling_node->getPath();
369 # If we are still searching for the insertion point
370 if(!$found)
371 {
372 if($sibling_path->toString() eq $path->toString())
373 {
374 # What?!? This node already exists! why are we adding it again!
375 print STDERR "ClassifyTreeNode.insertNode: what?!? node already exists... how did we get here?\n";
376 return $sibling_node;
377 }
378 elsif($sibling_path->toString() gt $path->toString())
379 {
380 # Found our location!
381 $found = 1;
382 $child_clid = $self->{'clid'} . "." . $offset;
383 # You may notice we haven't added this node to contains.
384 # This is because the parent node already contains this
385 # clid - instead we need to record the new highest clid
386 # created when we move the sibling nodes for here onwards
387 # up one space.
388 #rint STDERR "ClassifyTreeNode.insertNode: found our location: $child_clid \n";
389 last;
390 }
391 }
392 $offset++;
393 }
394 # If we haven't found the node, we insert at the end.
395 if(!$found)
396 {
397 #rint STDERR "ClassifyTreeNode.insertNode not found... insert at end \n";
398 $child_clid = $self->{'clid'} . "." . $offset;
399 # Contains needs to have this new clid added
400 $new_contains = $child_clid;
401 }
402 # If we did find the node, we now have to go through the sibling nodes
403 # shifting them up one CLID to ensure there's space.
404 else
405 {
406 # We need another copy of children, but this time with the last
407 # children first!
408 @children = reverse $self->getChildren();
409 $offset2 = scalar(@children) + 1;
410 foreach my $sibling_node (@children)
411 {
412 $sibling_node->changeCLID($self->{'clid'} . "." . $offset2);
413 # If this if the highest sibling node we are going to rename,
414 # then use it to set the contains metadata.
415 if($new_contains !~ /\w+/)
416 {
417 $new_contains = $self->{'clid'} . "." . $offset2;
418 }
419 # Once we've processed the node exactly in the space the new
420 # node will occupy, we're done.
421 $offset2--;
422 if($offset2 == $offset)
423 {
424 last;
425 }
426 }
427 }
428 }
429 $child_node = new ClassifyTreeNode($self->{'model'}, $child_clid, 1);
430 # Set the value, as this is the only piece of metadata we know and care
431 # about at this stage
432 $child_node->setTitle($path->getLastPathComponent());
433 # Update the contains metadata for this node
434 $contains = $self->getContains();
435 if($contains =~ /\w/)
436 {
437 $contains .= ";" . $new_contains;
438 }
439 else
440 {
441 $contains = $new_contains;
442 }
443 $self->setContains($contains);
444 # And return the node
445 return $child_node;
446 }
447# /** insertNode() **/
448
449# /** Remove all the children of this node and return the number of document
450# * references (leaf nodes) removed by this process.
451# *
452# * @return The count of document references removed as an integer
453# *
454# * @author John Thompson, DL Consulting Ltd
455# */
456sub removeAllNodes()
457 {
458 my ($self) = @_;
459 print STDERR "ClassifyTreeNode.removeAllNodes()\n" unless !$self->{'debug'};
460 my $num_leaf_docs = 0;
461 # Recursively remove this nodes children
462 my @children = $self->getChildren();
463 foreach my $child_node (@children)
464 {
465 $child_node->removeAllNodes();
466 }
467 # Retrieve the document count (leaf docs)
468 my $text = $self->toString();
469 if ($text =~ /<numleafdocs>(\d+)/)
470 {
471 $num_leaf_docs += $1;
472 }
473 # Now remove the node from the database. We do this calling set gdbm with
474 # no value argument.
475 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'});
476 # Return the leaf count (so we can adjust the numleafdocs at the root node
477 # of this deletion.
478 return $num_leaf_docs;
479 }
480# /** removeAllNodes() **/
481
482# /** Remove the given document this node, and then update the numleafdocs
483# * metadata for all the ancestor nodes.
484# *
485# * @param $oid The unique identifier of a greenstone document
486# *
487# * @author John Thompson, DL Consulting Ltd.
488# */
489sub removeDocument()
490 {
491 my ($self, $oid) = @_;
492 print STDERR "ClassifyTreeNode::removeDocument(\"$oid\")\n" unless !$self->{'debug'};
493 # Retrieve the contains metadata
494 my $contains = $self->getContains();
495 # Remove this oid
496 my @contains_parts = split(/;/, $contains);
497 my @new_contains_parts = ();
498 foreach my $oid_or_clid (@contains_parts)
499 {
500 if ($oid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
501 {
502 push(@new_contains_parts, $oid_or_clid);
503 }
504 }
505 $contains = join(";", @new_contains_parts);
506 $self->setContains($contains);
507 # We now have to update the numleafdocs count for this node and its
508 # ancestor nodes
509 my $cur_node_obj = $self;
510 while ($cur_node_obj)
511 {
512 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
513 if ($numleafdocs =~ /^\d+$/)
514 {
515 $numleafdocs--;
516 }
517 else
518 {
519 $numleafdocs = 0;
520 }
521 $cur_node_obj->setNumLeafDocs($numleafdocs);
522 $cur_node_obj = $cur_node_obj->getParentNode();
523 }
524 # Done
525 }
526# /** removeDocument() **/
527
528# /** Remove the node denoted by the path.
529# *
530# * @param $child_node The node to be removed
531# *
532# * @author John Thompson, DL Consulting Ltd
533# */
534sub removeNode()
535 {
536 my ($self, $child_node) = @_;
537 # Not as easy as it first sounds as we have to do a recursive remove,
538 # keeping track of any documents removed so we can update document count.
539 # We then remove this node, adjusting the sibling's clid's as necessary
540 # before altering the contains.
541 print STDERR "ClassifyTreeNode::removeNode(child_node)\n" unless !$self->{'debug'};
542 my $remove_clid = $child_node->getCLID();
543 my $sibling_node = $child_node->getNextSibling();
544 # Recursively remove this nodes and its children, taking note of decrease
545 # in document count.
546 my $removed_numleafdocs = $child_node->removeAllNodes();
547 # Determine if removing this node requires other nodes to be moved, and if
548 # so, do so. We do this in a repeating loop until there are no further
549 # siblings, overwriting the $remove_clid variable with the clid of the node
550 # just changed (you'll see why in a moment).
551 while ($sibling_node != 0)
552 {
553 my $current_node = $sibling_node;
554 # Get this nodes sibling
555 $sibling_node = $current_node->getNextSibling();
556 # Record the CLID to change to
557 my $new_clid = $remove_clid;
558 # Record the old clid
559 $remove_clid = $current_node->getCLID();
560 # Modify the clid of the current node
561 $current_node->changeCLID($new_clid);
562 # Continue until there are no further sibling nodes
563 }
564 # By now the $remove_clid will contain the CLID that has to be removed from
565 # the contains metadata for this node
566 my $contains = $self->getContains();
567 my @contains_parts = split(/;/, $contains);
568 my @new_contains_parts = ();
569 foreach my $oid_or_clid (@contains_parts)
570 {
571 if ($remove_clid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
572 {
573 push(@new_contains_parts, $oid_or_clid);
574 }
575 }
576 $contains = join(";", @new_contains_parts);
577 $self->setContains($contains);
578 # We also alter the numleafdocs metadata to reflect the removal of these
579 # nodes.
580 my $numleafdocs = $self->getNumLeafDocs();
581 if ($numleafdocs =~ /^\d+$/)
582 {
583 $numleafdocs -= $removed_numleafdocs;
584 }
585 else
586 {
587 $numleafdocs = 0;
588 }
589 $self->setNumLeafDocs($numleafdocs);
590 # Done
591 }
592# /** removeNode() **/
593
594# /** Set the contains metadata in the database.
595# *
596# * @param $contains The new contains string
597# *
598# * @author John Thompson, DL Consulting Ltd.
599# */
600sub setContains()
601 {
602 my ($self, $contains) = @_;
603 print STDERR "ClassifyTreeNode::setContains(\"$contains\")\n" unless !$self->{'debug'};
604 # Replace any occurance of this nodes CLID with "
605 $contains =~ s/$self->{'clid'}/\"/g;
606 # Load the text of this node
607 my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
608 # Replace the contains
609 #rint STDERR "Before: $text\n";
610 $text =~ s/<contains>.*?\n+/<contains>$contains\n/;
611 #rint STDERR "After: $text\n";
612 # Store the changed text
613 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
614 }
615# /** setContains() **/
616
617# /** Set the numleafdocs metadata in the database.
618# *
619# * @param $numleafdocs The new count of leaf documents
620# *
621# * @author John Thompson, DL Consulting Ltd.
622# */
623sub setNumLeafDocs()
624 {
625 my ($self, $numleafdocs) = @_;
626 print STDERR "ClassifyTreeNode::setNumLeafDocs(numleafdocs)\n" unless !$self->{'debug'};
627 # Load the text of this node
628 my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
629 # Replace the numleafdocs
630 $text =~ s/<numleafdocs>\d*?\n+/<numleafdocs>$numleafdocs\n/;
631 # Store the changed text
632 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
633 }
634# /** setNumLeafDocs() **/
635
636# /** Set the title metadata in the database.
637# * Note: Previously this was value and we extracted the title, but the new
638# * autohierarchies don't set values.
639# *
640# * @param $title The new title string
641# *
642# * @author John Thompson, DL Consulting Ltd.
643# */
644sub setTitle()
645 {
646 my ($self, $title) = @_;
647 print STDERR "ClassifyTreeNode::setTitle(\"$title\")\n" unless !$self->{'debug'};
648 # Load the text of this node
649 my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
650 # Replace the title
651 $text =~ s/<Title>.*?\n+/<Title>$title\n/;
652 # Store the changed text
653 &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
654 }
655# /** setValue() **/
656
657# /** Represent this node as a string.
658# *
659# * @return The string representation of this node
660# *
661# * @author John Thompson, DL Consulting Ltd.
662# */
663sub toString()
664 {
665 my ($self) = @_;
666 print STDERR "ClassifyTreeNode::toString()\n" unless !$self->{'debug'};
667 my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
668 return $text;
669 }
670# /** toString() **/
671
6721;
Note: See TracBrowser for help on using the repository browser.