source: trunk/gsdl/perllib/classify/Hierarchy.pm@ 1947

Last change on this file since 1947 was 1947, checked in by dmm9, 23 years ago

updated documentation

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.6 KB
Line 
1###########################################################################
2#
3# Hierarchy.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for generating hierarchical classifications
27
28# options for this classifier are:
29# -hfile file.txt - classification file
30# -metadata Metaname - metadata field to test against file.txt
31# -sort Meta - this option is optional (genious;-). by default this
32# classifier will sort documents within each section
33# alphabetically by Title. sort=nosort prevents sorting
34# (i.e. documents will end up in build order), sort=Meta
35# will sort each field alphabetically by Meta (Meta may
36# also be 'Filename' to sort by the original filename).
37# -title Title - another optional field. this is what will end up in the
38# Title field for this classification. if not present it
39# defaults to Metaname
40
41package Hierarchy;
42
43use BasClas;
44use util;
45use cfgread;
46use sorttools;
47
48sub BEGIN {
49 @ISA = ('BasClas');
50}
51
52sub print_usage {
53 print STDERR "
54 usage: classify Hierarchy [options]
55 options:
56
57 -title X Title field for this classification.
58 Defaults to metadata name.
59
60 -metadata X Metadata field used for classification,
61 list will be sorted by this element.
62
63 -hfile X The classification structure file
64
65 -sort X Metadata field to sort by (defaults to none)
66";
67}
68
69
70sub new {
71 my $class = shift (@_);
72 my $self = new BasClas($class, @_);
73
74 my $sortname = "Title";
75 my ($hfile, $metadata, $title);
76
77 if (!parsargv::parse(\@_,
78 q^title/.*/^, \$title,
79 q^sort/.*/nosort^, \$sortname,
80 q^hfile/.*/^, \$hfile,
81 q^metadata/.*/^, \$metadata,
82 "allow_extra_options")) {
83
84 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
85 &print_usage();
86 die "\n";
87 }
88
89 if (!$metadata) {
90 &print_usage;
91 print STDERR "\nHierarchy error: no metadata supplied\n";
92 die "\n";
93 }
94
95 $title = $metadata unless ($title);
96
97 $sortname = undef if $sortname =~ /^nosort$/;
98
99 if (!$hfile) {
100 &print_usage;
101 print STDERR "\nHierarchy error: No -hfile supplied\n";
102 die "\n";
103 }
104
105 my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile);
106 if (!-e $subjectfile) {
107 my $collfile = $subjectfile;
108 $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);
109 if (!-e $subjectfile) {
110 my $outhandle = $self->{'outhandle'};
111 &print_usage;
112 print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n";
113 print STDERR "This file should be in $collfile or $subjectfile\n";
114 die "\n";
115 }
116 }
117
118 $self->{'descriptorlist'} = {}; # first field in subject file
119 $self->{'locatorlist'} = {}; # second field in subject file
120 $self->{'subjectfile'} = $subjectfile;
121 $self->{'metaname'} = $metadata;
122 $self->{'sortname'} = $sortname;
123 $self->{'title'} = $title;
124
125 return bless $self, $class;
126}
127
128sub init {
129 my $self = shift (@_);
130
131 # read in the subject file
132 my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\w');
133
134 # $list is a hash that is indexed by the descriptor. The contents of this
135 # hash is a list of two items. The first item is the OID and the second item
136 # is the title
137 foreach $descriptor (keys (%$list)) {
138 $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
139 unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
140 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
141 $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
142 }
143 }
144}
145
146sub classify {
147 my $self = shift (@_);
148 my ($doc_obj) = @_;
149
150 my $doc_OID = $doc_obj->get_OID();
151
152 my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
153 $self->{'metaname'});
154
155 my $sortmeta = "";
156 if (defined $self->{'sortname'}) {
157 if ($self->{'sortname'} =~ /^filename$/i) {
158 $sortmeta = $doc_obj->get_source_filename();
159 } else {
160 $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
161 $self->{'sortname'});
162 if (defined $sortmeta) {
163 if ($self->{'sortname'} eq "Creator") {
164 &sorttools::format_string_name_english (\$sortmeta);
165 } else {
166 &sorttools::format_string_english (\$sortmeta);
167 }
168 }
169 }
170 $sortmeta = "" unless defined $sortmeta;
171 }
172
173 foreach $metaelement (@$metadata) {
174 if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
175 (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
176 push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
177 [$doc_OID, $sortmeta]);
178 }
179 }
180}
181
182sub get_classify_info {
183 my $self = shift (@_);
184
185 my $list = $self->{'locatorlist'};
186
187 my $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
188 foreach $OID (keys (%$list)) {
189
190 my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "VList");
191
192 if (defined $self->{'sortname'}) {
193 foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
194 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
195 }
196 } else {
197 foreach $subOID (@{$list->{$OID}->{'contents'}}) {
198 push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
199 }
200 }
201 }
202
203 return $classifyinfo;
204}
205
206sub get_OID_entry {
207 my $self = shift (@_);
208 my ($OID, $classifyinfo, $title, $classifytype) = @_;
209
210 $OID = "" unless defined $OID;
211 $OID =~ s/^\.+//;
212
213 my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
214 $tailOID = "" unless defined $tailOID;
215
216 if (!defined $headOID) {
217 $classifyinfo->{'Title'} = $title;
218 $classifyinfo->{'classifytype'} = $classifytype;
219 return $classifyinfo;
220 }
221
222 $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
223
224 # add entries to the contains list until we have one for headOID
225 #### the +10 is a hack that works but I'm not completely sure why
226 #### and don't have time to delve deeper. one day someone should
227 #### fix this ;-) -- Stefan
228 while (scalar(@{$classifyinfo->{'contains'}}) <= ($headOID+10)) {
229 push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
230 }
231
232 return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $title, $classifytype);
233}
234
235sub get_entry {
236 my $self = shift (@_);
237 my ($title, $childtype, $thistype) = @_;
238
239 # organise into classification structure
240 my %classifyinfo = ('childtype'=>$childtype,
241 'Title'=>$title,
242 'contains'=>[]);
243 $classifyinfo{'thistype'} = $thistype
244 if defined $thistype && $thistype =~ /\w/;
245
246 return \%classifyinfo;
247}
248
249
2501;
Note: See TracBrowser for help on using the repository browser.