source: trunk/gsdl/perllib/classify/Hierarchy.pm@ 9206

Last change on this file since 9206 was 9206, checked in by davidb, 19 years ago

Reworking of AutoHierarchy and Hierarchy so they are merged as one "super"
hierarchy classifier that is backward compatible.

The original Hierarchy is now HFileHierarchy. This is now an abstract
class that encapsulates everything needed to use the -hfile option.

AutoHierarchy has now been renamed to Hierarchy. Classifiers options between
the two have been merged and kept backwards compatible. If a user specifies
-hfile to Hierarhcy it is patched through to HFileHierarchy to do things
the way they used to be done. If no -hfile flag is specified then the
newer "auto" ability is used.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.8 KB
Line 
1###########################################################################
2#
3# Hierarchy.pm -- classifier that enables a Hierarchy to beformed without
4# the need for a hierarchy file (like HFileHierarchy). Used
5# to be called AutoHierarchy. Inherits from HFileHierarchy
6# so can also do everything that does as well.
7# Created by Imene, modified by Katherine and David.
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# Copyright (C) 1999 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29###########################################################################
30
31# An advanced Hierarchical classifier
32# to see the options, run "perl -S classinfo.pl Hierarchy"
33
34package Hierarchy;
35
36use HFileHierarchy;
37use sorttools;
38
39sub BEGIN {
40 @ISA = ('HFileHierarchy');
41}
42
43my $arguments =
44 [ { 'name' => "separator",
45 'desc' => "{AutoHierarchy.separator}",
46 'type' => "regexp",
47 'deft' => "[\\\\\\\/|\\\\\\\|]",
48 'reqd' => "no" },
49 { 'name' => "suppresslastlevel",
50 'desc' => "{AutoHierarchy.suppresslastlevel}",
51 'type' => "flag",
52 'reqd' => "no" } ];
53
54my $options = { 'name' => "Hierarchy",
55 'desc' => "{AutoHierarchy.desc}",
56 'abstract' => "no",
57 'inherits' => "yes",
58 'args' => $arguments };
59
60
61sub new {
62 my $class = shift (@_);
63 my $self = new HFileHierarchy($class, @_);
64
65 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
66 my $option_list = $self->{'option_list'};
67 push( @{$option_list}, $options );
68
69 if ($self->{'info_only'}) {
70 # created from classinfo.pl - don't need to parse the arguments
71 return bless $self, $class;
72 }
73
74 my $separator;
75 my $suppresslastlevel = 0;
76
77 if (!parsargv::parse(\@_,
78 q^separator/.*/^, \$separator,
79 q^suppresslastlevel^, \$suppresslastlevel,
80 "allow_extra_options")) {
81
82 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
83 $self->print_txt_usage(""); # Use default resource bundle
84 die "\n";
85 }
86
87
88 if (!$separator) {
89 $separator = "[\\\/|\\\|]";
90 }
91 $self->{'separator'} = $separator;
92 $self->{'suppresslastlevel'} = $suppresslastlevel;
93
94 # the hash that we use to build up the hierarchy
95 $self->{'path_hash'}= {};
96
97 return bless $self, $class;
98}
99
100
101sub auto_classify {
102 my $self = shift (@_);
103 my ($doc_obj,$nosort,$sortmeta,$metavalues) = @_;
104
105 my $doc_OID = $doc_obj->get_OID();
106
107 #Add all the metadata values to the hash
108 my $path_hash;
109 my $current_pos;
110
111 foreach my $metavalue (@$metavalues) {
112 $path_hash = $self->{'path_hash'};
113 my @chunks = split (/$self->{'separator'}/, $metavalue);
114 if ($self->{'suppresslastlevel'}) {
115 pop(@chunks); # remove the last element from the end
116 }
117
118 foreach my $folderName (@chunks)
119 {
120 if ($folderName ne ""){ #sometimes the tokens are empty
121 $current_pos = $self->add_To_Hash($path_hash, $folderName, $nosort);
122 $path_hash = $current_pos->{'nodes'};
123 }
124 }
125 # now add the document, with sort meta if needed
126 if ($nosort) {
127 push(@{$current_pos->{'docs'}}, $doc_OID);
128 } else {
129 if (defined $sortmeta) {
130 # can you ever get the same doc twice in one classification??
131 $current_pos->{'docs'}->{$doc_OID} = $sortmeta;
132 } else {
133 $current_pos->{'docs'}->{$doc_OID} = $metavalue;
134 }
135 }
136 } # foreach metadata
137
138}
139
140sub classify {
141 my $self = shift (@_);
142 my ($doc_obj) = @_;
143
144 my $doc_OID = $doc_obj->get_OID();
145
146 # are we sorting the list??
147 my $nosort = 0;
148 if (defined $self->{'sortname'} && $self->{'sortname'} eq "nosort") {
149 $nosort = 1;
150 }
151
152 my $metavalues = [];
153 # find all the metadata values
154 foreach $m (@{$self->{'meta_list'}}) {
155 my $mvalues = $doc_obj->get_metadata($doc_obj->get_top_section(), $m);
156 next unless (@{$mvalues});
157 if ($self->{'onlyfirst'}) {
158 # we only want the first metadata value
159 push (@$metavalues, $mvalues[0]);
160 last;
161 }
162 push (@$metavalues, @$mvalues);
163 last if (!$self->{'allvalues'}); # we don't want to try other elements
164 # cos we have already found some
165 }
166
167 return unless (@$metavalues);
168
169 #check for a sort element other than our metadata
170 my $sortmeta = undef;
171 if (!$nosort && defined $self->{'sortname'}) {
172
173 if ($self->{'sortname'} =~ /^filename$/i) {
174 $sortmeta = $doc_obj->get_source_filename();
175 } else {
176 $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'sortname'});
177 if (defined $sortmeta) {
178 $sortmeta = &sorttools::format_metadata_for_sorting($self->{'sortname'}, $sortmeta, $doc_obj);
179 }
180 }
181 $sortmeta = "" unless defined $sortmeta;
182 }
183
184 if (defined $self->{'subjectfile'}) {
185 $self->hfile_classify($doc_obj,$sortmeta,$metavalues);
186 }
187 else {
188 $self->auto_classify($doc_obj,$nosort,$sortmeta,$metavalues);
189 }
190}
191
192sub add_To_Hash {
193 my $self = shift (@_);
194 my ($myhash, $k, $nosort) = @_;
195
196 if (!defined $myhash->{$k}){
197 $myhash->{$k}={};
198 $myhash->{$k}->{'nodes'}={};
199 if ($nosort) {
200 $myhash->{$k}->{'docs'}=[];
201 } else {
202 $myhash->{$k}->{'docs'} = {};
203 }
204 }
205 return $myhash->{$k};
206}
207
208sub print_Hash{
209 my $self = shift (@_);
210 my ($myHash, $num_spaces) = @_;
211
212 foreach my $key (keys %{$myHash}){
213 print "\n";
214 $self->print_spaces($num_spaces);
215 print STDERR "$key*";
216 $self->print_Hash($myHash->{$key}, $num_spaces + 2);
217 }
218}
219
220sub print_spaces{
221 my $self = shift (@_);
222 my ($num_spaces) = @_;
223
224 for ($i = 0; $i < $num_spaces; $i++){
225 print STDERR " ";
226 }
227}
228
229sub get_entry {
230 my $self = shift (@_);
231 my ($title, $childtype, $thistype) = @_;
232
233 # organise into classification structure
234 my %classifyinfo = ('childtype'=>$childtype,
235 'Title'=>$title,
236 'contains'=>[],);
237 $classifyinfo{'thistype'} = $thistype
238 if defined $thistype && $thistype =~ /\w/;
239
240 return \%classifyinfo;
241}
242
243sub process_hash {
244 my $self = shift (@_);
245 my ($top_hash, $top_entry) = @_;
246 my ($entry);
247
248 my $hash = {};
249 foreach my $key (sort keys %{$top_hash}) {
250 $entry = $self->get_entry($key,"VList","VList");
251 my $has_content = 0;
252 my @doc_list;
253 # generate a sorted list of doc ids
254 if ($nosort && scalar(@{$top_hash->{$key}->{'docs'}})) {
255 @doc_list = @{$top_hash->{$key}->{'docs'}};
256 } elsif (!$nosort && (keys %{$top_hash->{$key}->{'docs'}})) {
257 @doc_list = sort {$top_hash->{$key}->{'docs'}->{$a}
258 cmp $top_hash->{$key}->{'docs'}->{$b};} keys %{$top_hash->{$key}->{'docs'}};
259
260 }
261 # if this key has documents, add them
262 if (@doc_list) {
263 $has_content = 1;
264 foreach $d(@doc_list) {
265 push (@{$entry->{'contains'}}, {'OID'=>$d});
266 }
267 }
268 # if this key has nodes, add them
269 if (scalar(keys %{$top_hash->{$key}->{'nodes'}})) {
270 $has_content = 1;
271 $self->process_hash($top_hash->{$key}->{'nodes'}, $entry);
272 }
273 # if we have found some content, add the new entry for this key into the parent node
274 if ($has_content) {
275 push (@{$top_entry->{'contains'}}, $entry);
276 }
277
278 }
279}
280
281sub auto_get_classify_info {
282 my $self = shift (@_);
283 my ($no_thistype) = @_;
284 $no_thistype = 0 unless defined $no_thistype;
285
286 my ($classification);
287 my $top_h = $self->{'path_hash'};
288
289 if ($self->{'path_hash'}) {
290 if ($self->{'hlist_at_top'}) {
291 $classification = $self->get_entry ($self->{'title'}, "HList", "Invisible");
292 }
293 else {
294 $classification = $self->get_entry ($self->{'title'}, "VList", "Invisible");
295 }
296 }
297
298 $self->process_hash($top_h, $classification);
299
300 return $classification;
301
302}
303
304sub auto_get_classify_info
305{
306 my $self = shift (@_);
307 my ($classifyinfo) = @_;
308
309 $self->process_hash($self->{'path_hash'}, $classifyinfo);
310
311 return $classifyinfo;
312}
313
314
315sub get_classify_info {
316 my $self = shift (@_);
317
318 my ($classifyinfo);
319
320 if ($self->{'hlist_at_top'}) {
321 $classifyinfo = $self->get_entry ($self->{'title'}, "HList", "Invisible");
322 }
323 else {
324 $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
325 }
326
327 if (defined $self->{'subjectfile'}) {
328 return $self->hfile_get_classify_info($classifyinfo);
329 }
330 else {
331 return $self->auto_get_classify_info($classifyinfo);
332 }
333}
334
335
3361;
337
338
339
Note: See TracBrowser for help on using the repository browser.