########################################################################### # # Hierarchy.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # classifier plugin for generating hierarchical classifications # options for this classifier are: # -hfile file.txt - classification file # -metadata Metaname - metadata field to test against file.txt # -sort Meta - this option is optional (genious;-). by default this # classifier will sort documents within each section # alphabetically by Title. sort=nosort prevents sorting # (i.e. documents will end up in build order), sort=Meta # will sort each field alphabetically by Meta (Meta may # also be 'Filename' to sort by the original filename). # -title Title - another optional field. this is what will end up in the # Title field for this classification. if not present it # defaults to Metaname package Hierarchy; use BasClas; use util; use cfgread; use sorttools; sub BEGIN { @ISA = ('BasClas'); } sub print_usage { print STDERR " usage: classify Hierarchy [options] options: -title X Title field for this classification. Defaults to metadata name. -metadata X Metadata field used for classification, list will be sorted by this element. -hfile X The classification structure file -sort X Metadata field to sort by (defaults to none) "; } sub new { my $class = shift (@_); my $self = new BasClas($class, @_); my $sortname = "Title"; my ($hfile, $metadata, $title); if (!parsargv::parse(\@_, q^title/.*/^, \$title, q^sort/.*/nosort^, \$sortname, q^hfile/.*/^, \$hfile, q^metadata/.*/^, \$metadata, "allow_extra_options")) { print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n"; &print_usage(); die "\n"; } if (!$metadata) { &print_usage; print STDERR "\nHierarchy error: no metadata supplied\n"; die "\n"; } $title = $metadata unless ($title); $sortname = undef if $sortname =~ /^nosort$/; if (!$hfile) { &print_usage; print STDERR "\nHierarchy error: No -hfile supplied\n"; die "\n"; } my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile); if (!-e $subjectfile) { my $collfile = $subjectfile; $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile); if (!-e $subjectfile) { my $outhandle = $self->{'outhandle'}; &print_usage; print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n"; print STDERR "This file should be in $collfile or $subjectfile\n"; die "\n"; } } $self->{'descriptorlist'} = {}; # first field in subject file $self->{'locatorlist'} = {}; # second field in subject file $self->{'subjectfile'} = $subjectfile; $self->{'metaname'} = $metadata; $self->{'sortname'} = $sortname; $self->{'title'} = $title; return bless $self, $class; } sub init { my $self = shift (@_); # read in the subject file my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\w'); # $list is a hash that is indexed by the descriptor. The contents of this # hash is a list of two items. The first item is the OID and the second item # is the title foreach $descriptor (keys (%$list)) { $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0]; unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) { $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1]; $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = []; } } } sub classify { my $self = shift (@_); my ($doc_obj) = @_; my $doc_OID = $doc_obj->get_OID(); my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(), $self->{'metaname'}); my $sortmeta = ""; if (defined $self->{'sortname'}) { if ($self->{'sortname'} =~ /^filename$/i) { $sortmeta = $doc_obj->get_source_filename(); } else { $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'sortname'}); if (defined $sortmeta) { if ($self->{'sortname'} eq "Creator") { &sorttools::format_string_name_english (\$sortmeta); } else { &sorttools::format_string_english (\$sortmeta); } } } $sortmeta = "" unless defined $sortmeta; } foreach $metaelement (@$metadata) { if ((defined $self->{'descriptorlist'}->{$metaelement}) && (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) { push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}}, [$doc_OID, $sortmeta]); } } } sub get_classify_info { my $self = shift (@_); my $list = $self->{'locatorlist'}; my $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible"); foreach $OID (keys (%$list)) { my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "VList"); if (defined $self->{'sortname'}) { foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) { push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]}); } } else { foreach $subOID (@{$list->{$OID}->{'contents'}}) { push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]}); } } } return $classifyinfo; } sub get_OID_entry { my $self = shift (@_); my ($OID, $classifyinfo, $title, $classifytype) = @_; $OID = "" unless defined $OID; $OID =~ s/^\.+//; my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/; $tailOID = "" unless defined $tailOID; if (!defined $headOID) { $classifyinfo->{'Title'} = $title; $classifyinfo->{'classifytype'} = $classifytype; return $classifyinfo; } $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'}; # add entries to the contains list until we have one for headOID #### the +10 is a hack that works but I'm not completely sure why #### and don't have time to delve deeper. one day someone should #### fix this ;-) -- Stefan while (scalar(@{$classifyinfo->{'contains'}}) <= ($headOID+10)) { push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype)); } return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $title, $classifytype); } sub get_entry { my $self = shift (@_); my ($title, $childtype, $thistype) = @_; # organise into classification structure my %classifyinfo = ('childtype'=>$childtype, 'Title'=>$title, 'contains'=>[]); $classifyinfo{'thistype'} = $thistype if defined $thistype && $thistype =~ /\w/; return \%classifyinfo; } 1;