Changeset 9206


Ignore:
Timestamp:
2005-02-28T12:09:17+13:00 (19 years ago)
Author:
davidb
Message:

Reworking of AutoHierarchy and Hierarchy so they are merged as one "super"
hierarchy classifier that is backward compatible.

The original Hierarchy is now HFileHierarchy. This is now an abstract
class that encapsulates everything needed to use the -hfile option.

AutoHierarchy has now been renamed to Hierarchy. Classifiers options between
the two have been merged and kept backwards compatible. If a user specifies
-hfile to Hierarhcy it is patched through to HFileHierarchy to do things
the way they used to be done. If no -hfile flag is specified then the
newer "auto" ability is used.

Location:
trunk/gsdl/perllib/classify
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/Hierarchy.pm

    r8852 r9206  
    11###########################################################################
    22#
    3 # Hierarchy.pm --
     3# Hierarchy.pm -- classifier that enables a Hierarchy to beformed without
     4#                 the need for a hierarchy file (like HFileHierarchy). Used
     5#                 to be called AutoHierarchy.  Inherits from HFileHierarchy
     6#                 so can also do everything that does as well.
     7#                 Created by Imene, modified by Katherine and David.
     8#
    49# A component of the Greenstone digital library software
    510# from the New Zealand Digital Library Project at the
     
    2429###########################################################################
    2530
    26 # classifier plugin for generating hierarchical classifications
    27 
     31# An advanced Hierarchical classifier
     32# to see the options, run "perl -S classinfo.pl Hierarchy"
    2833
    2934package Hierarchy;
    3035
    31 use BasClas;
    32 use util;
    33 use cfgread;
     36use HFileHierarchy;
    3437use sorttools;
    3538
    3639sub BEGIN {
    37     @Hierarchy::ISA = ('BasClas');
    38 }
    39 
    40 my $arguments =
    41     [ { 'name' => "metadata",
    42     'desc' => "{Hierarchy.metadata}",
    43     'type' => "metadata",
    44     'reqd' => "yes" },
    45       { 'name' => "hfile",
    46     'desc' => "{Hierarchy.hfile}",
    47     'type' => "string",
    48     'deft' => "",
    49     'reqd' => "yes" },
    50       { 'name' => "buttonname",
    51     'desc' => "{BasClas.buttonname}",
    52     'type' => "string",
    53     'deft' => "",
     40    @ISA = ('HFileHierarchy');
     41}
     42
     43my $arguments =
     44    [ { 'name' => "separator",
     45    'desc' => "{AutoHierarchy.separator}",
     46    'type' => "regexp",
     47    'deft' => "[\\\\\\\/|\\\\\\\|]",
    5448    'reqd' => "no" },
    55       { 'name' => "sort",
    56     'desc' => "{Hierarchy.sort}",
    57     'type' => "string",
    58     'deft' => "{BasClas.metadata.deft}",
    59     'reqd' => "no" },
    60       { 'name' => "reverse_sort",
    61     'desc' => "{Hierarchy.reverse_sort}",
    62     'type' => "flag",
    63     'reqd' => "no" },
    64       { 'name' => "hlist_at_top",
    65     'desc' => "{Hierarchy.hlist_at_top}",
     49      { 'name' => "suppresslastlevel",
     50    'desc' => "{AutoHierarchy.suppresslastlevel}",
    6651    'type' => "flag",
    6752    'reqd' => "no" } ];
    6853
    69 my $options =
    70 {   'name'     => "Hierarchy",
    71     'desc'     => "{Hierarchy.desc}",
    72     'abstract' => "no",
    73     'inherits' => "yes",
    74     'args'     => $arguments };
     54my $options = { 'name'     => "Hierarchy",
     55        'desc'     => "{AutoHierarchy.desc}",
     56        'abstract' => "no",
     57        'inherits' => "yes",
     58        'args'     => $arguments };
    7559
    7660
    7761sub new {
    7862    my $class = shift (@_);
    79     my $self = new BasClas($class, @_);
    80  
     63    my $self = new HFileHierarchy($class, @_);
     64
     65    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    8166    my $option_list = $self->{'option_list'};
    8267    push( @{$option_list}, $options );
    83  
     68
    8469    if ($self->{'info_only'}) {
    8570    # created from classinfo.pl - don't need to parse the arguments
    8671    return bless $self, $class;
    8772    }
    88  
    89     my ($hfile, $metadata, $sortname, $reverse_sort, $title, $hlist_at_top);
    90    
     73
     74    my $separator;
     75    my $suppresslastlevel = 0;
     76
    9177    if (!parsargv::parse(\@_,
    92              q^buttonname/.*/^, \$title,
    93              q^sort/.*/^, \$sortname,
    94              q^reverse_sort^, \$reverse_sort,
    95              q^hfile/.*/^, \$hfile,
    96              q^metadata/.*/^, \$metadata,
    97              q^hlist_at_top^, \$hlist_at_top,
     78             q^separator/.*/^, \$separator,
     79             q^suppresslastlevel^, \$suppresslastlevel,
    9880             "allow_extra_options")) {
    9981   
     
    10183    $self->print_txt_usage("");  # Use default resource bundle
    10284    die "\n";
    103 
    104     }
    105 
    106     if (!$metadata) {
    107     print STDERR "$class Error: required option -metadata not supplied\n";
    108     $self->print_txt_usage("");  # Use default resource bundle
     85    }
     86
     87   
     88    if (!$separator) {
     89    $separator = "[\\\/|\\\|]";
     90    }
     91    $self->{'separator'} = $separator;
     92    $self->{'suppresslastlevel'} = $suppresslastlevel;
     93   
     94    # the hash that we use to build up the hierarchy
     95    $self->{'path_hash'}= {};
     96   
     97    return bless $self, $class;
     98}
     99
     100
     101sub auto_classify {
     102    my $self = shift (@_);
     103    my ($doc_obj,$nosort,$sortmeta,$metavalues) = @_;
     104
     105    my $doc_OID = $doc_obj->get_OID();
     106   
     107    #Add all the metadata values to the hash
     108    my $path_hash;
     109    my $current_pos;
     110   
     111    foreach my $metavalue (@$metavalues) {
     112    $path_hash = $self->{'path_hash'};
     113    my @chunks = split (/$self->{'separator'}/, $metavalue);
     114    if ($self->{'suppresslastlevel'}) {
     115        pop(@chunks); # remove the last element from the end
     116    }
    109117   
    110     die "$class Error: required option -metadata not supplied\n";
    111     }
    112    
    113     if (!$hfile) {
    114     print STDERR "$class Error: required option -hfile not supplied\n";
    115     $self->print_txt_usage("");  # Use default resource bundle
    116    
    117     die "$class Error: required option -hfile not supplied\n";
    118     }
    119    
    120     $title = $metadata unless ($title);
    121     # if no sortname specified, it defaults to metadata
    122     $sortname = $metadata unless ($sortname);
    123     $sortname = undef if $sortname =~ /^nosort$/;
    124     if (defined $sortname && $reverse_sort) {
    125     $self->{'reverse_sort'} = 1;
    126     }
    127    
    128     my $subjectfile;
    129     $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile);
    130     if (!-e $subjectfile) {
    131     my $collfile = $subjectfile;
    132     $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);
    133     if (!-e $subjectfile) {
    134         my $outhandle = $self->{'outhandle'};
    135         print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n";
    136         print STDERR "This file should be in $collfile or $subjectfile\n";
    137         $self->print_txt_usage("");  # Use default resource bundle
    138         print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n";
    139         print STDERR "This file should be in $collfile or $subjectfile\n";
    140         die "\n";
    141     }
    142     }
    143    
    144     $self->{'descriptorlist'} = {}; # first field in subject file
    145     $self->{'locatorlist'} = {}; # second field in subject file
    146     $self->{'subjectfile'} = $subjectfile;
    147     $self->{'metaname'} = $metadata;
    148     $self->{'sortname'} = $sortname;
    149     $self->{'title'} = $title;
    150     $self->{'hlist_at_top'} = $hlist_at_top;
    151    
    152     return bless $self, $class;
    153 }
    154 
    155 sub init {
    156     my $self = shift (@_);
    157    
    158     # read in the subject file
    159     my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\S');
    160     # $list is a hash that is indexed by the descriptor. The contents of this
    161     # hash is a list of two items. The first item is the OID and the second item
    162     # is the title
    163     foreach $descriptor (keys (%$list)) {
    164     $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
    165     unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
    166         $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
    167         $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
    168     }
    169     }
     118    foreach my $folderName (@chunks)
     119    {
     120        if ($folderName ne ""){ #sometimes the tokens are empty
     121        $current_pos = $self->add_To_Hash($path_hash, $folderName, $nosort);
     122        $path_hash = $current_pos->{'nodes'};
     123        }
     124    }
     125    # now add the document, with sort meta if needed
     126    if ($nosort) {
     127        push(@{$current_pos->{'docs'}}, $doc_OID);
     128    } else {
     129        if (defined $sortmeta) {
     130        # can you ever get the same doc twice in one classification??
     131        $current_pos->{'docs'}->{$doc_OID} = $sortmeta;
     132        } else {
     133        $current_pos->{'docs'}->{$doc_OID} = $metavalue;
     134        }
     135    }
     136    } # foreach metadata
     137
    170138}
    171139
     
    173141    my $self = shift (@_);
    174142    my ($doc_obj) = @_;
    175    
     143
    176144    my $doc_OID = $doc_obj->get_OID();
    177    
    178     my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
    179                        $self->{'metaname'});
    180    
    181     my $lang = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), 'Language');
    182     $lang = 'en' unless defined $lang;
    183 
    184     my $sortmeta = "";
    185     if (defined $self->{'sortname'}) {
     145
     146    # are we sorting the list??
     147    my $nosort = 0;
     148    if (defined $self->{'sortname'} && $self->{'sortname'} eq "nosort") {
     149    $nosort = 1;
     150    }
     151
     152    my $metavalues = [];
     153    # find all the metadata values
     154    foreach $m (@{$self->{'meta_list'}}) {
     155    my $mvalues = $doc_obj->get_metadata($doc_obj->get_top_section(), $m);
     156    next unless (@{$mvalues});
     157    if ($self->{'onlyfirst'}) {
     158        # we only want the first metadata value
     159        push (@$metavalues, $mvalues[0]);
     160        last;
     161    }
     162    push (@$metavalues, @$mvalues);
     163    last if (!$self->{'allvalues'}); # we don't want to try other elements
     164                                     # cos we have already found some
     165    }
     166   
     167    return unless (@$metavalues);
     168
     169    #check for a sort element other than our metadata
     170    my $sortmeta = undef;
     171    if (!$nosort && defined $self->{'sortname'}) {
     172       
    186173    if ($self->{'sortname'} =~ /^filename$/i) {
    187174        $sortmeta = $doc_obj->get_source_filename();
    188175    } else {
    189         $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
    190                                $self->{'sortname'});
     176        $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'sortname'});
    191177        if (defined $sortmeta) {
    192178        $sortmeta = &sorttools::format_metadata_for_sorting($self->{'sortname'}, $sortmeta, $doc_obj);
    193         #if ($self->{'sortname'} eq "Creator") {
    194         #    if ($lang eq 'en') {
    195         #   &sorttools::format_string_name_english (\$sortmeta);
    196         #    }
    197         #} else {
    198         #    if ($lang eq 'en') {
    199         #   &sorttools::format_string_english (\$sortmeta);
    200         #    }
    201         #}
    202179        }
    203180    }
    204181    $sortmeta = "" unless defined $sortmeta;
    205182    }
    206    
    207     foreach $metaelement (@$metadata) {
    208     if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
    209         (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
    210         push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
    211           [$doc_OID, $sortmeta]);
    212         my $localid = $self->{'descriptorlist'}->{$metaelement};
    213         my $classid = $self->get_number();
    214 
    215         $doc_obj->add_metadata($doc_obj->get_top_section(), "memberof", "CL$classid.$localid");
    216     }
    217     }
    218 }
    219 
    220 sub get_classify_info {
    221     my $self = shift (@_);
    222    
    223     my $list = $self->{'locatorlist'};
    224    
    225     my ($classifyinfo);
    226     if ($self->{'hlist_at_top'}) {
    227     $classifyinfo = $self->get_entry ($self->{'title'}, "HList", "Invisible");
    228     } else {
    229     $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
    230     }
    231     # sorted the keys - otherwise funny things happen - kjdon 03/01/03
    232     foreach $OID (sort keys (%$list)) {
    233     my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "VList");
    234    
    235     if (defined $self->{'sortname'}) {
    236         if ($self->{'reverse_sort'}) {
    237         foreach $subOID (sort {$b->[1] cmp $a->[1];} @{$list->{$OID}->{'contents'}}) {
    238             push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
    239         }
    240         }
    241         else {
    242         foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
    243             push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
    244         }
    245         }
    246     }
    247     else {
    248         foreach $subOID (@{$list->{$OID}->{'contents'}}) {
    249         push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
    250         }
    251     }
    252     }
    253    
    254     return $classifyinfo;
    255 }
    256 
    257 sub supports_memberof {
    258     my $self = shift(@_);
    259 
    260     return "true";
    261 }
    262 
    263 sub get_OID_entry {
    264     my $self = shift (@_);
    265     my ($OID, $classifyinfo, $title, $classifytype) = @_;
    266 
    267     $OID = "" unless defined $OID;
    268     $OID =~ s/^\.+//;
    269 
    270     my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
    271     $tailOID = "" unless defined $tailOID;
    272 
    273     if (!defined $headOID) {
    274     $classifyinfo->{'Title'} = $title;
    275     $classifyinfo->{'classifytype'} = $classifytype;
    276     return $classifyinfo;
    277     }
    278 
    279     $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
    280     my $offset = 0;
    281     foreach my $thing (@{$classifyinfo->{'contains'}}) {
    282     $offset ++ if defined $thing->{'OID'};
    283     }
    284 
    285     while (scalar(@{$classifyinfo->{'contains'}}) < ($headOID+$offset)) {
    286     push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
    287     }
    288 
    289     return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID+$offset-1)], $title, $classifytype);
     183
     184    if (defined $self->{'subjectfile'}) {
     185    $self->hfile_classify($doc_obj,$sortmeta,$metavalues);
     186    }
     187    else {
     188    $self->auto_classify($doc_obj,$nosort,$sortmeta,$metavalues);
     189    }
     190}
     191
     192sub add_To_Hash {
     193    my $self = shift (@_);
     194    my ($myhash, $k, $nosort) = @_;
     195   
     196    if (!defined $myhash->{$k}){
     197    $myhash->{$k}={};
     198    $myhash->{$k}->{'nodes'}={};
     199    if ($nosort) {
     200        $myhash->{$k}->{'docs'}=[];
     201    } else {
     202        $myhash->{$k}->{'docs'} = {};
     203    }
     204    }
     205    return $myhash->{$k};
     206}
     207
     208sub print_Hash{
     209    my $self = shift (@_);
     210    my ($myHash, $num_spaces) = @_;
     211
     212    foreach my $key (keys %{$myHash}){
     213    print "\n";
     214    $self->print_spaces($num_spaces);
     215    print STDERR "$key*";
     216    $self->print_Hash($myHash->{$key}, $num_spaces + 2);
     217    }   
     218}
     219
     220sub print_spaces{
     221    my $self = shift (@_);
     222    my ($num_spaces) = @_;
     223   
     224    for ($i = 0; $i < $num_spaces; $i++){
     225    print STDERR " ";
     226    }
    290227}
    291228
     
    293230    my $self = shift (@_);
    294231    my ($title, $childtype, $thistype) = @_;
    295     my $memberof = &supports_memberof();
    296232   
    297233    # organise into classification structure
    298234    my %classifyinfo = ('childtype'=>$childtype,
    299235            'Title'=>$title,
    300             'supportsmemberof'=>$memberof,
    301             'contains'=>[]);
     236            'contains'=>[],);
    302237    $classifyinfo{'thistype'} = $thistype
    303238    if defined $thistype && $thistype =~ /\w/;
    304 
     239   
    305240    return \%classifyinfo;
    306241}
    307242
     243sub process_hash {
     244    my $self = shift (@_);
     245    my ($top_hash, $top_entry) = @_;   
     246    my ($entry);
     247   
     248    my $hash = {};
     249    foreach my $key (sort keys %{$top_hash}) {
     250    $entry = $self->get_entry($key,"VList","VList");
     251    my $has_content = 0;
     252    my @doc_list;
     253    # generate a sorted list of doc ids
     254    if ($nosort && scalar(@{$top_hash->{$key}->{'docs'}})) {
     255        @doc_list = @{$top_hash->{$key}->{'docs'}};
     256    } elsif (!$nosort && (keys %{$top_hash->{$key}->{'docs'}})) {
     257        @doc_list = sort {$top_hash->{$key}->{'docs'}->{$a}
     258                  cmp $top_hash->{$key}->{'docs'}->{$b};} keys %{$top_hash->{$key}->{'docs'}};
     259       
     260    }
     261    # if this key has documents, add them
     262    if (@doc_list) {
     263        $has_content = 1;
     264        foreach $d(@doc_list) {
     265        push (@{$entry->{'contains'}}, {'OID'=>$d});
     266        }   
     267    }
     268    # if this key has nodes, add them
     269    if (scalar(keys %{$top_hash->{$key}->{'nodes'}})) {
     270        $has_content = 1;
     271        $self->process_hash($top_hash->{$key}->{'nodes'}, $entry);
     272    }
     273    # if we have found some content, add the new entry for this key into the parent node
     274    if ($has_content) {
     275        push (@{$top_entry->{'contains'}}, $entry);
     276    }
     277
     278    }   
     279}
     280
     281sub auto_get_classify_info {
     282    my $self = shift (@_);
     283    my ($no_thistype) = @_;
     284    $no_thistype = 0 unless defined $no_thistype;
     285
     286    my ($classification);
     287    my $top_h = $self->{'path_hash'};
     288
     289    if ($self->{'path_hash'}) {
     290    if ($self->{'hlist_at_top'}) {
     291        $classification = $self->get_entry ($self->{'title'}, "HList", "Invisible");
     292    }
     293    else {
     294        $classification = $self->get_entry ($self->{'title'}, "VList", "Invisible");
     295    }
     296    }
     297
     298    $self->process_hash($top_h, $classification);
     299   
     300    return  $classification;
     301
     302}
     303
     304sub auto_get_classify_info
     305{
     306    my $self = shift (@_);
     307    my ($classifyinfo) = @_;
     308
     309    $self->process_hash($self->{'path_hash'}, $classifyinfo);
     310
     311    return $classifyinfo;
     312}
     313
     314
     315sub get_classify_info {
     316    my $self = shift (@_);
     317
     318    my ($classifyinfo);
     319
     320    if ($self->{'hlist_at_top'}) {
     321    $classifyinfo = $self->get_entry ($self->{'title'}, "HList", "Invisible");
     322    }
     323    else {
     324    $classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
     325    }
     326
     327    if (defined $self->{'subjectfile'}) {
     328    return $self->hfile_get_classify_info($classifyinfo);
     329    }
     330    else {
     331    return $self->auto_get_classify_info($classifyinfo);
     332    }
     333}
     334
    308335
    3093361;
     337
     338
     339
Note: See TracChangeset for help on using the changeset viewer.