source: main/tags/2.30/gsdl/perllib/classify/Hierarchy.pm@ 23841

Last change on this file since 23841 was 1608, checked in by nzdl, 24 years ago

Inserted an ugly hack into the Hierarchy classifier to mask a bug that's
been lurking about for a while. One day I'll fix it properly... maybe.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1###########################################################################
2#
3# Hierarchy.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for generating hierarchical classifications
27
28# options for this classifier are:
29# hfile=file.txt - classification file
30# metadata=Metaname - metadata field to test against file.txt
31# sort=Meta - this option is optional (genious;-). by default this
32# classifier will sort documents within each section
33# alphabetically by Title. sort=nosort prevents sorting
34# (i.e. documents will end up in build order), sort=Meta
35# will sort each field alphabetically by Meta (Meta may
36# also be 'Filename' to sort by the original filename).
37# title=Title - another optional field. this is what will end up in the
38# Title field for this classification. if not present it
39# defaults to Metaname
40
41package Hierarchy;
42
43use BasClas;
44use util;
45use cfgread;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub new {
53 my ($class, @options) = @_;
54 my $self = new BasClas ($class, @_);
55
56 my $sortname = "Title";
57 my ($hfile, $metadata, $title);
58
59 foreach $option (@options) {
60 if ($option =~ /^sort=(.*)$/i) {
61 $sortname = $1;
62 $sortname = undef if $sortname =~ /^nosort$/i;
63 } elsif ($option =~ /^hfile=(.*)$/i) {
64 $hfile = $1;
65 } elsif ($option =~ /^metadata=(.*)$/i) {
66 $metadata = $1;
67 } elsif ($option =~ /^title=(.*)$/i) {
68 $title = $1;
69 }
70 }
71
72 if (!defined ($hfile) || !defined ($metadata)) {
73 my $outhandle = $self->{'outhandle'};
74 print $outhandle "Error in options passed to Hierarchy classification\n";
75 print $outhandle "hfile and metadata options aren't optional\n\n";
76 print $outhandle "Options:\n";
77 print $outhandle "hfile -- the classification structure file\n";
78 print $outhandle "metadata -- metadata field to test against hfile\n";
79 print $outhandle "sort -- metadata field to sort by\n";
80 print $outhandle "title -- title of classification\n\n";
81 die "\n";
82 }
83
84 $title = $metadata unless defined $title;
85
86 my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile);
87 if (!-e $subjectfile) {
88 my $collfile = $subjectfile;
89 $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);
90 if (!-e $subjectfile) {
91 my $outhandle = $self->{'outhandle'};
92 print $outhandle "Hierarchy Error: Can't locate subject file $hfile\n";
93 print $outhandle "This file should be in $collfile or $subjectfile\n";
94 die "\n";
95 }
96 }
97
98 $self->{'descriptorlist'} = {}; # first field in subject file
99 $self->{'locatorlist'} = {}; # second field in subject file
100 $self->{'subjectfile'} = $subjectfile;
101 $self->{'metaname'} = $metadata;
102 $self->{'sortname'} = $sortname;
103 $self->{'title'} = $title;
104
105 return bless $self, $class;
106}
107
108sub init {
109 my $self = shift (@_);
110
111 # read in the subject file
112 my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\w');
113
114 # $list is a hash that is indexed by the descriptor. The contents of this
115 # hash is a list of two items. The first item is the OID and the second item
116 # is the title
117 foreach $descriptor (keys (%$list)) {
118 $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
119 unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
120 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
121 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
122 }
123 }
124}
125
126sub classify {
127 my $self = shift (@_);
128 my ($doc_obj) = @_;
129
130 my $doc_OID = $doc_obj->get_OID();
131
132 my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
133 $self->{'metaname'});
134
135 my $sortmeta = "";
136 if (defined $self->{'sortname'}) {
137 if ($self->{'sortname'} =~ /^filename$/i) {
138 $sortmeta = $doc_obj->get_source_filename();
139 } else {
140 $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
141 $self->{'sortname'});
142 if (defined $sortmeta) {
143 if ($self->{'sortname'} eq "Creator") {
144 &sorttools::format_string_name_english (\$sortmeta);
145 } else {
146 &sorttools::format_string_english (\$sortmeta);
147 }
148 }
149 }
150 $sortmeta = "" unless defined $sortmeta;
151 }
152
153 foreach $metaelement (@$metadata) {
154 if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
155 (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
156 push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
157 [$doc_OID, $sortmeta]);
158 }
159 }
160}
161
162sub get_classify_info {
163 my $self = shift (@_);
164
165 my $list = $self->{'locatorlist'};
166
167 my $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
168 foreach $OID (keys (%$list)) {
169
170 my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "VList");
171
172 if (defined $self->{'sortname'}) {
173 foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
174 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
175 }
176 } else {
177 foreach $subOID (@{$list->{$OID}->{'contents'}}) {
178 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
179 }
180 }
181 }
182
183 return $classifyinfo;
184}
185
186sub get_OID_entry {
187 my $self = shift (@_);
188 my ($OID, $classifyinfo, $title, $classifytype) = @_;
189
190 $OID = "" unless defined $OID;
191 $OID =~ s/^\.+//;
192
193 my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
194 $tailOID = "" unless defined $tailOID;
195
196 if (!defined $headOID) {
197 $classifyinfo->{'Title'} = $title;
198 $classifyinfo->{'classifytype'} = $classifytype;
199 return $classifyinfo;
200 }
201
202 $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
203
204 # add entries to the contains list until we have one for headOID
205 #### the +10 is a hack that works but I'm not completely sure why
206 #### and don't have time to delve deeper. one day someone should
207 #### fix this ;-) -- Stefan
208 while (scalar(@{$classifyinfo->{'contains'}}) <= ($headOID+10)) {
209 push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
210 }
211
212 return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $title, $classifytype);
213}
214
215sub get_entry {
216 my $self = shift (@_);
217 my ($title, $childtype, $thistype) = @_;
218
219 # organise into classification structure
220 my %classifyinfo = ('childtype'=>$childtype,
221 'Title'=>$title,
222 'contains'=>[]);
223 $classifyinfo{'thistype'} = $thistype
224 if defined $thistype && $thistype =~ /\w/;
225
226 return \%classifyinfo;
227}
228
229
2301;
Note: See TracBrowser for help on using the repository browser.