source: main/trunk/greenstone2/perllib/classify/Hierarchy.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 14 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# Hierarchy.pm -- classifier that enables a Hierarchy to beformed without
4# the need for a hierarchy file (like HFileHierarchy). Used
5# to be called AutoHierarchy. Inherits from HFileHierarchy
6# so can also do everything that does as well.
7# Created by Imene, modified by Katherine and David.
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# Copyright (C) 1999 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29###########################################################################
30
31# An advanced Hierarchical classifier
32# to see the options, run "perl -S classinfo.pl Hierarchy"
33
34package Hierarchy;
35
36use HFileHierarchy;
37use sorttools;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42sub BEGIN {
43 @Hierarchy::ISA = ('HFileHierarchy');
44}
45
46my $arguments =
47 [ { 'name' => "separator",
48 'desc' => "{Hierarchy.separator}",
49 'type' => "regexp",
50 'deft' => "[\\\\\\\/|\\\\\\\|]",
51 'reqd' => "no" },
52 { 'name' => "suppresslastlevel",
53 'desc' => "{Hierarchy.suppresslastlevel}",
54 'type' => "flag",
55 'reqd' => "no" },
56 { 'name' => "suppressfirstlevel",
57 'desc' => "{Hierarchy.suppressfirstlevel}",
58 'type' => "flag",
59 'reqd' => "no" }
60 ];
61
62my $options = { 'name' => "Hierarchy",
63 'desc' => "{Hierarchy.desc}",
64 'abstract' => "no",
65 'inherits' => "yes",
66 'args' => $arguments };
67
68
69sub new {
70 my ($class) = shift (@_);
71 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
72 push(@$classifierslist, $class);
73
74 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
75 push(@{$hashArgOptLists->{"OptList"}},$options);
76
77 my $self = new HFileHierarchy($classifierslist, $inputargs, $hashArgOptLists);
78
79 # the hash that we use to build up the hierarchy
80 $self->{'path_hash'}= {};
81
82 return bless $self, $class;
83}
84
85
86sub auto_classify {
87 my $self = shift (@_);
88 my ($doc_obj,$nosort,$sortmeta,$metavalues) = @_;
89
90 my $doc_OID = $doc_obj->get_OID();
91
92 #Add all the metadata values to the hash
93 my $path_hash;
94 my $current_pos;
95
96
97 foreach my $metavalue (@$metavalues) {
98 $path_hash = $self->{'path_hash'};
99 my @chunks = split (/$self->{'separator'}/, $metavalue);
100 if ($self->{'suppresslastlevel'}) {
101 pop(@chunks); # remove the last element from the end
102 }
103 if ($self->{'suppressfirstlevel'}) {
104 shift(@chunks);
105 }
106 foreach my $folderName (@chunks)
107 {
108 # Removing leading and trailing spaces
109 $folderName =~ s/^(\s+)//;
110 $folderName =~ s/(\s+)$//;
111 if ($folderName ne ""){ #sometimes the tokens are empty
112 $current_pos = $self->add_To_Hash($path_hash, $folderName, $nosort);
113 $path_hash = $current_pos->{'nodes'};
114 }
115 }
116 # now add the document, with sort meta if needed
117 if ($nosort) {
118 push(@{$current_pos->{'docs'}}, $doc_OID);
119 } else {
120 $current_pos->{'docs'}->{$doc_OID} = $sortmeta;
121
122 #if (defined $sortmeta) {
123 # # can you ever get the same doc twice in one classification??
124 # $current_pos->{'docs'}->{$doc_OID} = $sortmeta;
125 # } else {
126 # $current_pos->{'docs'}->{$doc_OID} = $metavalue;
127 # }
128 }
129 } # foreach metadata
130
131}
132
133sub classify {
134 my $self = shift (@_);
135 my ($doc_obj) = @_;
136
137 my $doc_OID = $doc_obj->get_OID();
138
139 # are we sorting the list??
140 my $nosort = 0;
141 if (!defined $self->{'sort'}) {
142 $nosort = 1;
143 }
144
145 my $metavalues = [];
146 # find all the metadata values
147 foreach my $m (@{$self->{'meta_list'}}) {
148 my $mvalues = $doc_obj->get_metadata($doc_obj->get_top_section(), $m);
149 next unless (@{$mvalues});
150 if ($self->{'firstvalueonly'}) {
151 # we only want the first metadata value
152 push (@$metavalues, $mvalues->[0]);
153 last;
154 }
155 push (@$metavalues, @$mvalues);
156 last if (!$self->{'allvalues'}); # we don't want to try other elements
157 # cos we have already found some
158 }
159
160 return unless (@$metavalues);
161
162 #check for a sort element other than our metadata
163 my $sortmeta = undef;
164 if (!$nosort) {
165 if ($self->{'sort'} =~ /^filename$/i) {
166 $sortmeta = $doc_obj->get_source_filename();
167 } else {
168 $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'sort'});
169 if (defined $sortmeta && !$self->{'no_metadata_formatting'}) {
170 $sortmeta = &sorttools::format_metadata_for_sorting($self->{'sort'}, $sortmeta, $doc_obj);
171 }
172 }
173 $sortmeta = "" unless defined $sortmeta;
174 }
175
176 if (defined $self->{'subjectfile'}) {
177 $self->hfile_classify($doc_obj,$sortmeta,$metavalues);
178 }
179 else {
180 $self->auto_classify($doc_obj,$nosort,$sortmeta,$metavalues);
181 }
182}
183
184sub add_To_Hash {
185 my $self = shift (@_);
186 my ($myhash, $k, $nosort) = @_;
187
188 if (!defined $myhash->{$k}){
189 $myhash->{$k}={};
190 $myhash->{$k}->{'nodes'}={};
191 if ($nosort) {
192 $myhash->{$k}->{'docs'}=[];
193 } else {
194 $myhash->{$k}->{'docs'} = {};
195 }
196 }
197 return $myhash->{$k};
198}
199
200sub print_Hash{
201 my $self = shift (@_);
202 my ($myHash, $num_spaces) = @_;
203
204 foreach my $key (keys %{$myHash}){
205 print "\n";
206 $self->print_spaces($num_spaces);
207 print STDERR "$key*";
208 $self->print_Hash($myHash->{$key}, $num_spaces + 2);
209 }
210}
211
212sub print_spaces{
213 my $self = shift (@_);
214 my ($num_spaces) = @_;
215
216 for (my $i = 0; $i < $num_spaces; $i++){
217 print STDERR " ";
218 }
219}
220
221sub get_entry {
222 my $self = shift (@_);
223 my ($title, $childtype, $thistype) = @_;
224
225 # organise into classification structure
226 my %classifyinfo = ('childtype'=>$childtype,
227 'Title'=>$title,
228 'contains'=>[],);
229 $classifyinfo{'thistype'} = $thistype
230 if defined $thistype && $thistype =~ /\w/;
231
232 return \%classifyinfo;
233}
234
235sub process_hash {
236 my $self = shift (@_);
237 my ($top_hash, $top_entry) = @_;
238 my ($entry);
239
240 my $hash = {};
241 foreach my $key (sort keys %{$top_hash}) {
242 $entry = $self->get_entry($key,"VList","VList");
243 my $has_content = 0;
244 my @doc_list;
245 # generate a sorted list of doc ids
246 if (not (defined ($self->{'sort'})) && scalar(@{$top_hash->{$key}->{'docs'}})) {
247 @doc_list = @{$top_hash->{$key}->{'docs'}};
248 } elsif (defined ($self->{'sort'}) && (keys %{$top_hash->{$key}->{'docs'}})) {
249 @doc_list = sort {$top_hash->{$key}->{'docs'}->{$a}
250 cmp $top_hash->{$key}->{'docs'}->{$b};} keys %{$top_hash->{$key}->{'docs'}};
251
252 }
253
254 if ($self->{'documents_last'}) {
255 # add nodes, then documents
256 # if this key has nodes, add them
257 if (scalar(keys %{$top_hash->{$key}->{'nodes'}})) {
258 $has_content = 1;
259 $self->process_hash($top_hash->{$key}->{'nodes'}, $entry);
260 }
261
262 # if this key has documents, add them
263 if (@doc_list) {
264 $has_content = 1;
265 foreach my $d (@doc_list) {
266 push (@{$entry->{'contains'}}, {'OID'=>$d});
267 }
268 }
269
270 } else {
271 # add documents then nodes
272 # if this key has documents, add them
273 if (@doc_list) {
274 $has_content = 1;
275 foreach my $d (@doc_list) {
276 push (@{$entry->{'contains'}}, {'OID'=>$d});
277 }
278 }
279 # if this key has nodes, add them
280 if (scalar(keys %{$top_hash->{$key}->{'nodes'}})) {
281 $has_content = 1;
282 $self->process_hash($top_hash->{$key}->{'nodes'}, $entry);
283 }
284 }
285
286 # if we have found some content, add the new entry for this key into the parent node
287 if ($has_content) {
288 push (@{$top_entry->{'contains'}}, $entry);
289 }
290
291 }
292}
293
294sub auto_get_classify_info {
295 my $self = shift (@_);
296 my ($no_thistype) = @_;
297 $no_thistype = 0 unless defined $no_thistype;
298
299 my ($classification);
300 my $top_h = $self->{'path_hash'};
301
302 if ($self->{'path_hash'}) {
303 if ($self->{'hlist_at_top'}) {
304 $classification = $self->get_entry ($self->{'buttonname'}, "HList", "Invisible");
305 }
306 else {
307 $classification = $self->get_entry ($self->{'buttonname'}, "VList", "Invisible");
308 }
309 }
310
311 $self->process_hash($top_h, $classification);
312
313 return $classification;
314
315}
316
317sub auto_get_classify_info
318{
319 my $self = shift (@_);
320 my ($classifyinfo) = @_;
321
322 $self->process_hash($self->{'path_hash'}, $classifyinfo);
323
324 return $classifyinfo;
325}
326
327
328sub get_classify_info {
329 my $self = shift (@_);
330
331 my ($classifyinfo);
332
333 if ($self->{'hlist_at_top'}) {
334 $classifyinfo = $self->get_entry ($self->{'buttonname'}, "HList", "Invisible");
335 }
336 else {
337 $classifyinfo = $self->get_entry ($self->{'buttonname'}, "VList", "Invisible");
338 }
339
340 if (defined $self->{'subjectfile'}) {
341 return $self->hfile_get_classify_info($classifyinfo);
342 }
343 else {
344 return $self->auto_get_classify_info($classifyinfo);
345 }
346}
347
348
3491;
350
351
352
Note: See TracBrowser for help on using the repository browser.