source: main/trunk/greenstone2/perllib/classify/HFileHierarchy.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 14 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# HFileHierarchy.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for generating hierarchical classifications
27
28
29package HFileHierarchy;
30
31use BaseClassifier;
32use util;
33use cfgread;
34use sorttools;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39sub BEGIN {
40 @HFileHierarchy::ISA = ('BaseClassifier');
41}
42
43my $arguments =
44 [ { 'name' => "metadata",
45 'desc' => "{AZCompactList.metadata}",
46 'type' => "metadata",
47 'reqd' => "yes" },
48 { 'name' => "firstvalueonly",
49 'desc' => "{AZCompactList.firstvalueonly}",
50 'type' => "flag",
51 'reqd' => "no" },
52 { 'name' => "allvalues",
53 'desc' => "{AZCompactList.allvalues}",
54 'type' => "flag",
55 'reqd' => "no" },
56 { 'name' => "hfile",
57 'desc' => "{Hierarchy.hfile}",
58 'type' => "string",
59 'deft' => "",
60 'reqd' => "no" },
61 { 'name' => "sort",
62 'desc' => "{Hierarchy.sort}",
63 'type' => "metadata",
64 'reqd' => "no" },
65 { 'name' => "reverse_sort",
66 'desc' => "{Hierarchy.reverse_sort}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "hlist_at_top",
70 'desc' => "{Hierarchy.hlist_at_top}",
71 'type' => "flag",
72 'reqd' => "no" },
73 { 'name' => "documents_last",
74 'desc' => "{Hierarchy.documents_last}",
75 'type' => "flag",
76 'reqd' => "no"}
77 ];
78
79my $options =
80{ 'name' => "HFileHierarchy",
81 'desc' => "{HFileHierarchy.desc}",
82 'abstract' => "yes",
83 'inherits' => "yes",
84 'args' => $arguments };
85
86
87sub new {
88 my ($class) = shift (@_);
89 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
90 push(@$classifierslist, $class);
91
92 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93 push(@{$hashArgOptLists->{"OptList"}},$options);
94
95 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
96
97 if ($self->{'info_only'}) {
98 # don't worry about any options etc
99 return bless $self, $class;
100 }
101
102 my $metadata = $self->{'metadata'};
103 if (!$metadata) {
104 print STDERR "$class Error: required option -metadata not supplied\n";
105 $self->print_txt_usage(""); # Use default resource bundle
106
107 die "$class Error: required option -metadata not supplied\n";
108 }
109
110 $self->{'buttonname'} = $self->generate_title_from_metadata($metadata) unless ($self->{'buttonname'});
111 # strip ex from metadata
112 $metadata = $self->strip_ex_from_metadata($metadata);
113
114 my @meta_list = split(/,/, $metadata);
115 $self->{'meta_list'} = \@meta_list;
116
117 # sort = undef in this case is the same as sort=nosort
118 if ($self->{'sort'} eq "nosort") {
119 $self->{'sort'} = undef;
120 }
121 if (defined $self->{'sort'}) { # remove ex. namespace
122 $self->{'sort'} = $self->strip_ex_from_metadata($self->{'sort'});
123 }
124
125 if ($self->{'hfile'}) {
126 my $hfile = $self->{'hfile'};
127 my $subjectfile;
128 $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile);
129 if (!-e $subjectfile) {
130 my $collfile = $subjectfile;
131 $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);
132 if (!-e $subjectfile) {
133 my $outhandle = $self->{'outhandle'};
134 print STDERR "\nHFileHierarchy Error: Can't locate subject file $hfile\n";
135 print STDERR "This file should be in $collfile or $subjectfile\n";
136 $self->print_txt_usage(""); # Use default resource bundle
137 print STDERR "\nHFileHierarchy Error: Can't locate subject file $hfile\n";
138 print STDERR "This file should be in $collfile or $subjectfile\n";
139 die "\n";
140 }
141 }
142 $self->{'descriptorlist'} = {}; # first field in subject file
143 $self->{'locatorlist'} = {}; # second field in subject file
144 $self->{'subjectfile'} = $subjectfile;
145 }
146
147
148 # $self->{'firstvalueonly'} = $firstvalueonly;
149 # $self->{'allvalues'} = $allvalues;
150
151 #$self->{'hlist_at_top'} = $hlist_at_top;
152
153 # Clean out the unused keys
154 delete $self->{'metadata'};
155 delete $self->{'hfile'};
156
157 return bless $self, $class;
158}
159
160sub init {
161 my $self = shift (@_);
162
163 my $subjectfile = $self->{'subjectfile'};
164 if (defined $subjectfile) {
165 # read in the subject file
166 my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\S');
167 # $list is a hash that is indexed by the descriptor. The contents of this
168 # hash is a list of two items. The first item is the OID and the second item
169 # is the title
170 foreach my $descriptor (keys (%$list)) {
171 $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
172 unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
173 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
174 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
175 }
176 }
177 }
178}
179
180sub hfile_classify
181{
182 my $self = shift (@_);
183 my ($doc_obj,$sortmeta,$metavalues) = @_;
184
185 my $outhandle = $self->{'outhandle'};
186
187 my $doc_OID = $doc_obj->get_OID();
188
189 foreach my $metaelement (@$metavalues) {
190 if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
191 (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
192
193 push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
194 [$doc_OID, $sortmeta]);
195 my $localid = $self->{'descriptorlist'}->{$metaelement};
196 my $classid = $self->get_number();
197
198 $doc_obj->add_metadata($doc_obj->get_top_section(), "memberof", "CL$classid.$localid");
199
200 }
201 }
202}
203
204
205
206
207sub hfile_get_classify_info {
208 my $self = shift (@_);
209
210 my ($classifyinfo) = @_;
211
212 my $list = $self->{'locatorlist'};
213
214 my $classifier_num = "CL".$self->get_number();
215 # sorted the keys - otherwise funny things happen - kjdon 03/01/03
216 foreach my $OID (sort keys (%$list)) {
217 my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, "$classifier_num.$OID", $list->{$OID}->{'title'}, "VList");
218
219 if (defined $self->{'sort'}) {
220 if ($self->{'reverse_sort'}) {
221 foreach my $subOID (sort {$b->[1] cmp $a->[1];} @{$list->{$OID}->{'contents'}}) {
222 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
223 }
224 }
225 else {
226 foreach my $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
227 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
228 }
229 }
230 }
231 else {
232 foreach my $subOID (@{$list->{$OID}->{'contents'}}) {
233 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
234 }
235 }
236 }
237
238 return $classifyinfo;
239}
240
241
242sub supports_memberof {
243 my $self = shift(@_);
244
245 return "true";
246}
247
248sub get_OID_entry {
249 my $self = shift (@_);
250 my ($OID, $classifyinfo, $classifyOID, $title, $classifytype) = @_;
251
252 $OID = "" unless defined $OID;
253 $OID =~ s/^\.+//;
254
255 my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
256 $tailOID = "" unless defined $tailOID;
257
258 if (!defined $headOID) {
259 $classifyinfo->{'Title'} = $title;
260 $classifyinfo->{'classifyOID'} = $classifyOID;
261 $classifyinfo->{'classifytype'} = $classifytype;
262 return $classifyinfo;
263 }
264
265 $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
266 if ($self->{'documents_last'}) {
267 # documents should come after nodes in the classifier
268
269 my $doc_pos = 0;
270 foreach my $thing (@{$classifyinfo->{'contains'}}) {
271 last if defined $thing->{'OID'};
272 $doc_pos++;
273 }
274
275 while ($doc_pos < $headOID) {
276 splice(@{$classifyinfo->{'contains'}}, $doc_pos, 0, $self->get_entry("", $classifytype));
277 $doc_pos++;
278 }
279
280 return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $classifyOID, $title, $classifytype);
281
282 }
283
284 # else, documents come before nodes
285 my $offset = 0;
286 foreach my $thing (@{$classifyinfo->{'contains'}}) {
287 $offset ++ if defined $thing->{'OID'};
288 }
289
290 while (scalar(@{$classifyinfo->{'contains'}}) < ($headOID+$offset)) {
291 push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
292 }
293
294 return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID+$offset-1)], $classifyOID, $title, $classifytype);
295}
296
297sub get_entry {
298 my $self = shift (@_);
299 my ($title, $childtype, $thistype) = @_;
300 my $memberof = &supports_memberof();
301
302 # organise into classification structure
303 my %classifyinfo = ('childtype'=>$childtype,
304 'Title'=>$title,
305 'supportsmemberof'=>$memberof,
306 'contains'=>[]);
307 $classifyinfo{'thistype'} = $thistype
308 if defined $thistype && $thistype =~ /\w/;
309
310 return \%classifyinfo;
311}
312
313
3141;
Note: See TracBrowser for help on using the repository browser.