source: gsdl/trunk/perllib/ClassifyTreeNode.pm@ 18379

Last change on this file since 18379 was 17087, checked in by davidb, 16 years ago

Introduction of new GDBM alternative for archives.inf as step towards full incremental building. Information traditionally stored in archives.inf PLUS additional information that will help with working out what files have changed since last build, and what doc-id they hashed to is stored in two GDBM databases. For now these databases aren't read, but in the future ArchivesInfPlugin will be upgraded to use these to support these.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.3 KB
Line 
1package ClassifyTreeNode;
2
3use ClassifyTreeNode;
4use ClassifyTreePath;
5use GDBMUtils;
6use strict;
7
8# /** Constructor
9# *
10# *
11# * @author John Thompson, DL Consulting Ltd.
12# */
13sub new()
14 {
15 my ($class, $model, $clid, $force_new) = @_;
16 my $debug = 0;
17 $force_new = 0 unless defined($force_new);
18 print STDERR "ClassifyTreeNode.new(model, \"$clid\", $force_new)\n" unless !$debug;
19 $force_new = 0 unless defined($force_new);
20 # Test the parameters
21 die("Can't create a tree node that doesn't belong to a tree model!") unless $model;
22 die("Can't create a tree node that doesn't have a unique id (OID)!") unless $clid;
23 # Store the variables
24 my $self = {};
25 $self->{'debug'} = $debug;
26 $self->{'model'} = $model;
27 $self->{'clid'} = $clid;
28
29 my $collection = $model->getCollection();
30
31 # Check if this node already exists in the database, and if not insert it
32 # now
33 my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
34 if($text !~ /\w+/ && $force_new)
35 {
36 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid,
37 "<doctype>classify\n<hastxt>0\n<childtype>VList\n<Title>\n<numleafdocs>0\n<contains>\n");
38 }
39 # Bless me father for I have sinned
40 bless $self, $class;
41 return $self;
42 }
43# /** new() **/
44
45# /** Add a document to this tree node.
46# *
47# * @param $oid The unique identifier of the document to add
48# *
49# * @author John Thompson, DL Consulting Ltd
50# */
51sub addDocument()
52 {
53 my ($self, $oid) = @_;
54 print STDERR "ClassifyTreeNode.addDocument(\"$oid\")\n" unless !$self->{'debug'};
55 # Get the current contains list
56 my $contains = $self->getContains();
57 # See whether this document already exists in the contains
58 if ($contains !~ /(^$oid$|^$oid;$|;$oid;|;$oid$)/)
59 {
60 # If not, append to the contains list
61 if ($contains ne "")
62 {
63 $contains .= ";$oid";
64 }
65 else
66 {
67 $contains = $oid;
68 }
69 # Store the changed contains
70 $self->setContains($contains);
71 # We now have to update the numleafdocs count for this node and its
72 # ancestor nodes
73 my $cur_node_obj = $self;
74 while ($cur_node_obj)
75 {
76 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
77 if ($numleafdocs =~ /^\d+$/)
78 {
79 $numleafdocs ++;
80 }
81 else
82 {
83 $numleafdocs = 1;
84 }
85 $cur_node_obj->setNumLeafDocs($numleafdocs);
86 $cur_node_obj = $cur_node_obj->getParentNode();
87 }
88 }
89 else
90 {
91 print STDERR "Document already exists!\n" unless !$self->{'debug'};
92 }
93 }
94# /** addDocument() **/
95
96# /** Changes the CLID of a particular node. Note that this is significantly
97# * tricky step, as we have to remove the old node from the database, and
98# * then readd with the corrected CLID.
99# *
100# * @param $clid The new CLID as an integer
101# *
102# * @author John Thompson, DL Consulting Ltd.
103# */
104sub changeCLID()
105 {
106 my($self, $new_clid) = @_;
107 print STDERR "ClassifyTreeNode.changeCLID(\"$new_clid\")\n" unless !$self->{'debug'};
108 # Store the current clid for later use
109 my $old_clid = $self->{'clid'};
110 # And record the children now, as they'll change after we shift the parent
111 # CLID
112 my @child_nodes = $self->getChildren();
113
114 # Retrieve the current document
115 my $text = $self->toString();
116
117 my $collection = $self->{'model'}->getCollection();
118
119 # Create a new document with the correct CLID
120 &GDBMUtils::gdbmCachedCollectionSet($collection, $new_clid, $text);
121
122 # Remove the old document
123 &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
124
125 # Finally, change the clid stored in this document
126 $self->{'clid'} = $new_clid;
127
128 # Now go through this nodes children, and shift them too
129 foreach my $child_node (@child_nodes)
130 {
131 # We determine the new clid by retrieving the childs current clid,
132 # and then replacing any occurance to the parents old clid with the
133 # parents new clid
134 my $old_child_clid = $child_node->getCLID();
135 #rint STDERR "* considering: " . $old_child_clid . "\n";
136 if($old_child_clid =~ /^CL/)
137 {
138 my $new_child_clid = $new_clid . substr($old_child_clid, length($old_clid));
139 #rint STDERR "* shifting child $old_child_clid to $new_child_clid\n";
140 $child_node->changeCLID($new_child_clid);
141 }
142 }
143 }
144# /** changeCLID() **/
145
146
147# /** Retrieve the unique id for this classifier.
148# *
149# * @return The CLID as a string
150# *
151# * @author John Thompson, DL Consulting Ltd.
152# */
153sub getCLID()
154 {
155 my ($self) = @_;
156 print STDERR "ClassifyTreeNode.getCLID()\n" unless !$self->{'debug'};
157 return $self->{'clid'};
158 }
159
160# /** Return the child objects of this node an as array.
161# *
162# * @return An array of node objects
163# *
164# * @author John Thompson, DL Consulting Ltd.
165# */
166sub getChildren()
167 {
168 my ($self) = @_;
169 print STDERR "ClassifyTreeNode.getChildren()\n" unless !$self->{'debug'};
170 my $text = $self->toString();
171 my @children = ();
172 # Retrieve the contains metadata item
173 if($text =~ /<contains>(.*?)\r?\n/)
174 {
175 #rint STDERR "* children formed from contains: $1\n";
176 my $contains_raw = $1;
177 my @contains = split(/;/, $contains_raw);
178 foreach my $child_clid (@contains)
179 {
180 # Replace the " with the parent clid
181 $child_clid =~ s/\"/$self->{'clid'}/;
182 # Create the node obj
183 my $child_node_obj = new ClassifyTreeNode($self->{'model'}, $child_clid);
184 # And insert into ever growing array of child nodes
185 push(@children, $child_node_obj);
186 }
187 }
188 return @children;
189 }
190# /** getChildren() **/
191
192# /** Retrieve the contains metadata which is used to determine this nodes
193# * children.
194# *
195# * @return The contains metadata as a string
196# *
197# * @author John Thompson, DL Consulting Ltd.
198# */
199sub getContains()
200 {
201 my ($self) = @_;
202 print STDERR "ClassifyTreeNode.getContains()\n" unless !$self->{'debug'};
203 my $result = 0;
204 my $text = $self->toString();
205 if($text =~ /<contains>(.*?)\r?\n/)
206 {
207 $result = $1;
208 # Replace " with this nodes CLID
209 $result =~ s/\"/$self->{'clid'}/g;
210 }
211 return $result;
212 }
213# /** getContains() **/
214
215# /** Retrieve this nodes next sibling.
216# *
217# * @return The next sibling node object or 0 if no such node
218# *
219# * @author John Thompson, DL Consulting Ltd.
220# */
221sub getNextSibling()
222 {
223 my ($self) = @_;
224 print STDERR "ClassifyTreeNode.getNextSibling()\n" unless !$self->{'debug'};
225 my $sibling_node = 0;
226 # The next sibling would be the node identified by the CLID with its
227 # suffix number one greater than this nodes CLID.
228 my @clid_parts = split(/\./, $self->{'clid'});
229 my $suffix = pop(@clid_parts);
230 $suffix++;
231 push(@clid_parts, $suffix);
232 my $next_clid = join(".", @clid_parts);
233
234 my $collection = $self->{'model'}->getCollection();
235
236 # Now determine if this node exists.
237 if(&GDBMUtils::gdbmCachedCollectionGet($collection, $next_clid) =~ /\w+/)
238 {
239 # And if so, create it.
240 $sibling_node = new ClassifyTreeNode($self->{'model'}, $next_clid);
241 }
242 # Done
243 return $sibling_node;
244 }
245# /** getNextSibling() **/
246
247# /** Retrieve the numleafdocs metadata which if affected by any changes to
248# * child nodes.
249# *
250# * @return The numleafdocs as an integer
251# *
252# * @author John Thompson, DL Consulting Ltd.
253# */
254sub getNumLeafDocs()
255 {
256 my ($self) = @_;
257 print STDERR "ClassifyTreeNode.getNumLeafDocs()\n" unless !$self->{'debug'};
258 my $result = 0;
259 my $text = $self->toString();
260 if($text =~ /<numleafdocs>(\d*?)\r?\n/)
261 {
262 $result = $1;
263 }
264 return $result;
265 }
266# /** getNumLeafDocs() **/
267
268# /** Retrieve the parent node of the given node.
269# *
270# * @param $child_node The node whose parent we want to retrieve
271# * @return The parent node, or 0 if this is the root
272# *
273# * @author John Thompson, DL Consulting Ltd.
274# */
275sub getParentNode()
276 {
277 my ($self) = @_;
278 print STDERR "ClassifyTreeNode.getParentNode()\n" unless !$self->{'debug'};
279 my $parent_node = 0;
280 my $child_clid = $self->getCLID();
281 my @clid_parts = split(/\./, $child_clid);
282 if(scalar(@clid_parts) > 1)
283 {
284 pop(@clid_parts);
285 my $parent_clid = join(".", @clid_parts);
286 $parent_node = $self->{'model'}->getNodeByCLID($parent_clid);
287 }
288 # Otherwise we are already at the root node
289 return $parent_node;
290 }
291# /** getParentNode() **/
292
293# /** Retrieve the path to this node.
294# *
295# * @return The path obj which represents the path to this node or 0 if no
296# * path information exists
297# *
298# * @author John Thompson, DL Consulting Ltd.
299# */
300sub getPath()
301 {
302 my ($self) = @_;
303 print STDERR "ClassifyTreeNode.getPath()\n" unless !$self->{'debug'};
304 my $result = 0;
305 my $text = $self->toString();
306 if($text =~ /<Title>(.*?)\r?\n/ )
307 {
308 my $this_component = $1;
309 # If this node has a parent, then retrieve its path
310 my $parent_node = $self->getParentNode();
311 if ($parent_node)
312 {
313 # Get the path...
314 $result = $parent_node->getPath();
315 # ... and add our component
316 $result->addPathComponent($this_component);
317 }
318 else
319 {
320 $result = new ClassifyTreePath($this_component);
321 }
322 }
323 return $result;
324 }
325# /** getPath() **/
326
327# /** Retrieve the title of this node. This returns essentially the same
328# * information as getPath, but without the encapsulating object.
329# *
330# * @return The title as a string
331# *
332# * @author John Thompson, DL Consulting Ltd.
333# */
334sub getTitle()
335 {
336 my ($self) = @_;
337 print STDERR "ClassifyTreeNode.getTitle()\n" unless !$self->{'debug'};
338 my $result = 0;
339 my $text = $self->toString();
340 if($text =~ /<Title>(.*?)\r?\n/)
341 {
342 $result = $1;
343 }
344 return $result;
345 }
346# /** getTitle() **/
347
348# /** Using the given value locate the correct position to insert a new node,
349# * create it, and then establish it in the database.
350# *
351# * @param $path The path used to determine where to insert node as a string
352# * @return The newly inserted node object
353# *
354# * @author John Thompson, DL Consulting Ltd.
355# */
356sub insertNode()
357 {
358 my ($self, $path) = @_;
359 print STDERR "ClassifyTreeNode.insertNode(\"$path\")\n" unless !$self->{'debug'};
360 my $child_clid = "";
361 my $child_node = 0;
362 my $new_contains = "";
363 # Get the children of this node
364 my @children = $self->getChildren();
365 # If there are no current children, then this will be the first
366 if (scalar(@children) == 0)
367 {
368 #rint STDERR "ClassifyTreeNode.insertNode: first child!\n";
369 $child_clid = $self->{'clid'} . ".1"; # First child
370 # Contains needs to have this new clid added
371 $new_contains = $child_clid;
372 }
373 # Otherwise search through the current children, looking at their values
374 # to locate where to insert this node.
375 else
376 {
377 #rint STDERR "ClassifyTreeNode.insertNode: searching for position...\n";
378 my $found = 0;
379 my $offset = 1;
380 foreach my $sibling_node (@children)
381 {
382 my $sibling_path = $sibling_node->getPath();
383 # If we are still searching for the insertion point
384 if(!$found)
385 {
386 if($sibling_path->toString() eq $path->toString())
387 {
388 # What?!? This node already exists! why are we adding it again!
389 print STDERR "ClassifyTreeNode.insertNode: what?!? node already exists... how did we get here?\n";
390 return $sibling_node;
391 }
392 elsif($sibling_path->toString() gt $path->toString())
393 {
394 # Found our location!
395 $found = 1;
396 $child_clid = $self->{'clid'} . "." . $offset;
397 # You may notice we haven't added this node to contains.
398 # This is because the parent node already contains this
399 # clid - instead we need to record the new highest clid
400 # created when we move the sibling nodes for here onwards
401 # up one space.
402 #rint STDERR "ClassifyTreeNode.insertNode: found our location: $child_clid \n";
403 last;
404 }
405 }
406 $offset++;
407 }
408 # If we haven't found the node, we insert at the end.
409 if(!$found)
410 {
411 #rint STDERR "ClassifyTreeNode.insertNode not found... insert at end \n";
412 $child_clid = $self->{'clid'} . "." . $offset;
413 # Contains needs to have this new clid added
414 $new_contains = $child_clid;
415 }
416 # If we did find the node, we now have to go through the sibling nodes
417 # shifting them up one CLID to ensure there's space.
418 else
419 {
420 # We need another copy of children, but this time with the last
421 # children first!
422 @children = reverse $self->getChildren();
423 my $offset2 = scalar(@children) + 1;
424 foreach my $sibling_node (@children)
425 {
426 $sibling_node->changeCLID($self->{'clid'} . "." . $offset2);
427 # If this if the highest sibling node we are going to rename,
428 # then use it to set the contains metadata.
429 if($new_contains !~ /\w+/)
430 {
431 $new_contains = $self->{'clid'} . "." . $offset2;
432 }
433 # Once we've processed the node exactly in the space the new
434 # node will occupy, we're done.
435 $offset2--;
436 if($offset2 == $offset)
437 {
438 last;
439 }
440 }
441 }
442 }
443 $child_node = new ClassifyTreeNode($self->{'model'}, $child_clid, 1);
444 # Set the value, as this is the only piece of metadata we know and care
445 # about at this stage
446 $child_node->setTitle($path->getLastPathComponent());
447 # Update the contains metadata for this node
448 my $contains = $self->getContains();
449 if($contains =~ /\w/)
450 {
451 $contains .= ";" . $new_contains;
452 }
453 else
454 {
455 $contains = $new_contains;
456 }
457 $self->setContains($contains);
458 # And return the node
459 return $child_node;
460 }
461# /** insertNode() **/
462
463# /** Remove all the children of this node and return the number of document
464# * references (leaf nodes) removed by this process.
465# *
466# * @return The count of document references removed as an integer
467# *
468# * @author John Thompson, DL Consulting Ltd
469# */
470sub removeAllNodes()
471 {
472 my ($self) = @_;
473 print STDERR "ClassifyTreeNode.removeAllNodes()\n" unless !$self->{'debug'};
474 my $num_leaf_docs = 0;
475 # Recursively remove this nodes children
476 my @children = $self->getChildren();
477 foreach my $child_node (@children)
478 {
479 $child_node->removeAllNodes();
480 }
481 # Retrieve the document count (leaf docs)
482 my $text = $self->toString();
483 if ($text =~ /<numleafdocs>(\d+)/)
484 {
485 $num_leaf_docs += $1;
486 }
487 # Now remove the node from the database. We do this calling set gdbm with
488 # no value argument.
489 my $collection = $self->{'model'}->getCollection();
490 &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
491
492 # Return the leaf count (so we can adjust the numleafdocs at the root node
493 # of this deletion.
494 return $num_leaf_docs;
495 }
496# /** removeAllNodes() **/
497
498# /** Remove the given document this node, and then update the numleafdocs
499# * metadata for all the ancestor nodes.
500# *
501# * @param $oid The unique identifier of a greenstone document
502# *
503# * @author John Thompson, DL Consulting Ltd.
504# */
505sub removeDocument()
506 {
507 my ($self, $oid) = @_;
508 print STDERR "ClassifyTreeNode::removeDocument(\"$oid\")\n" unless !$self->{'debug'};
509 # Retrieve the contains metadata
510 my $contains = $self->getContains();
511 # Remove this oid
512 my @contains_parts = split(/;/, $contains);
513 my @new_contains_parts = ();
514 foreach my $oid_or_clid (@contains_parts)
515 {
516 if ($oid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
517 {
518 push(@new_contains_parts, $oid_or_clid);
519 }
520 }
521 $contains = join(";", @new_contains_parts);
522 $self->setContains($contains);
523 # We now have to update the numleafdocs count for this node and its
524 # ancestor nodes
525 my $cur_node_obj = $self;
526 while ($cur_node_obj)
527 {
528 my $numleafdocs = $cur_node_obj->getNumLeafDocs();
529 if ($numleafdocs =~ /^\d+$/)
530 {
531 $numleafdocs--;
532 }
533 else
534 {
535 $numleafdocs = 0;
536 }
537 $cur_node_obj->setNumLeafDocs($numleafdocs);
538 $cur_node_obj = $cur_node_obj->getParentNode();
539 }
540 # Done
541 }
542# /** removeDocument() **/
543
544# /** Remove the node denoted by the path.
545# *
546# * @param $child_node The node to be removed
547# *
548# * @author John Thompson, DL Consulting Ltd
549# */
550sub removeNode()
551 {
552 my ($self, $child_node) = @_;
553 # Not as easy as it first sounds as we have to do a recursive remove,
554 # keeping track of any documents removed so we can update document count.
555 # We then remove this node, adjusting the sibling's clid's as necessary
556 # before altering the contains.
557 print STDERR "ClassifyTreeNode::removeNode(child_node)\n" unless !$self->{'debug'};
558 my $remove_clid = $child_node->getCLID();
559 my $sibling_node = $child_node->getNextSibling();
560 # Recursively remove this nodes and its children, taking note of decrease
561 # in document count.
562 my $removed_numleafdocs = $child_node->removeAllNodes();
563 # Determine if removing this node requires other nodes to be moved, and if
564 # so, do so. We do this in a repeating loop until there are no further
565 # siblings, overwriting the $remove_clid variable with the clid of the node
566 # just changed (you'll see why in a moment).
567 while ($sibling_node != 0)
568 {
569 my $current_node = $sibling_node;
570 # Get this nodes sibling
571 $sibling_node = $current_node->getNextSibling();
572 # Record the CLID to change to
573 my $new_clid = $remove_clid;
574 # Record the old clid
575 $remove_clid = $current_node->getCLID();
576 # Modify the clid of the current node
577 $current_node->changeCLID($new_clid);
578 # Continue until there are no further sibling nodes
579 }
580 # By now the $remove_clid will contain the CLID that has to be removed from
581 # the contains metadata for this node
582 my $contains = $self->getContains();
583 my @contains_parts = split(/;/, $contains);
584 my @new_contains_parts = ();
585 foreach my $oid_or_clid (@contains_parts)
586 {
587 if ($remove_clid ne $oid_or_clid && $oid_or_clid =~ /[\w\d]+/)
588 {
589 push(@new_contains_parts, $oid_or_clid);
590 }
591 }
592 $contains = join(";", @new_contains_parts);
593 $self->setContains($contains);
594 # We also alter the numleafdocs metadata to reflect the removal of these
595 # nodes.
596 my $numleafdocs = $self->getNumLeafDocs();
597 if ($numleafdocs =~ /^\d+$/)
598 {
599 $numleafdocs -= $removed_numleafdocs;
600 }
601 else
602 {
603 $numleafdocs = 0;
604 }
605 $self->setNumLeafDocs($numleafdocs);
606 # Done
607 }
608# /** removeNode() **/
609
610# /** Set the contains metadata in the database.
611# *
612# * @param $contains The new contains string
613# *
614# * @author John Thompson, DL Consulting Ltd.
615# */
616sub setContains()
617 {
618 my ($self, $contains) = @_;
619 print STDERR "ClassifyTreeNode::setContains(\"$contains\")\n" unless !$self->{'debug'};
620 # Replace any occurance of this nodes CLID with "
621 $contains =~ s/$self->{'clid'}/\"/g;
622
623 my $collection = $self->{'model'}->getCollection();
624 my $clid = $self->{'clid'};
625
626 # Load the text of this node
627 my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
628
629 # Replace the contains
630 #rint STDERR "Before: $text\n";
631 $text =~ s/<contains>.*?\n+/<contains>$contains\n/;
632 #rint STDERR "After: $text\n";
633 # Store the changed text
634 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
635 }
636# /** setContains() **/
637
638# /** Set the numleafdocs metadata in the database.
639# *
640# * @param $numleafdocs The new count of leaf documents
641# *
642# * @author John Thompson, DL Consulting Ltd.
643# */
644sub setNumLeafDocs()
645 {
646 my ($self, $numleafdocs) = @_;
647 print STDERR "ClassifyTreeNode::setNumLeafDocs(numleafdocs)\n" unless !$self->{'debug'};
648
649 my $collection = $self->{'model'}->getCollection();
650 my $clid = $self->{'clid'};
651
652 # Load the text of this node
653 my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
654 # Replace the numleafdocs
655 $text =~ s/<numleafdocs>\d*?\n+/<numleafdocs>$numleafdocs\n/;
656 # Store the changed text
657 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
658 }
659# /** setNumLeafDocs() **/
660
661# /** Set the title metadata in the database.
662# * Note: Previously this was value and we extracted the title, but the new
663# * autohierarchies don't set values.
664# *
665# * @param $title The new title string
666# *
667# * @author John Thompson, DL Consulting Ltd.
668# */
669sub setTitle()
670 {
671 my ($self, $title) = @_;
672 print STDERR "ClassifyTreeNode::setTitle(\"$title\")\n" unless !$self->{'debug'};
673
674 my $collection = $self->{'model'}->getCollection();
675 my $clid = $self->{'clid'};
676
677 # Load the text of this node
678 my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
679 # Replace the title
680 $text =~ s/<Title>.*?\n+/<Title>$title\n/;
681 # Store the changed text
682 &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
683 }
684# /** setValue() **/
685
686# /** Represent this node as a string.
687# *
688# * @return The string representation of this node
689# *
690# * @author John Thompson, DL Consulting Ltd.
691# */
692sub toString()
693 {
694 my ($self) = @_;
695 print STDERR "ClassifyTreeNode::toString()\n" unless !$self->{'debug'};
696 my $collection = $self->{'model'}->getCollection();
697 my $clid = $self->{'clid'};
698
699 my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
700 return $text;
701 }
702# /** toString() **/
703
7041;
Note: See TracBrowser for help on using the repository browser.