Changeset 1313 for trunk


Ignore:
Timestamp:
2000-08-01T13:56:13+12:00 (24 years ago)
Author:
sjboddie
Message:

Added Davids version of AZCompactList which handles multiple value
metadata

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/AZCompactList.pm

    r1250 r1313  
    2626# classifier plugin for sorting alphabetically
    2727# options are:
     28#
    2829# metadata=Metaname -- all documents with Metaname metadata
    2930#                      will be included in list, list will be sorted
     
    3334# mingroup=Num      -- (optional) the smallest value that will cause
    3435#                      a group in the hierarchy to form.
    35 
     36# minnesting=Num    -- (optional) the smallest value that will cause a
     37#              list to converted into nested list
     38# mincompact=Num    -- (optional) used in compact list
     39# maxcompact=Num    -- (optional) used in compact list
     40# doclevel=top|section -- (optional) level to process document at.
     41# onlyfirst         -- (optional) control whether all or only first
     42#                      metadata value used from array of metadata
    3643package AZCompactList;
    3744
    38 use AZList;
    3945use sorttools;
    40 
    41 sub BEGIN {
    42     @ISA = ('AZList');
    43 }
    4446
    4547sub new {
     
    4850    my ($metaname, $title);
    4951    my $mingroup = 2;
     52    my $minnesting = 20;
     53    my $mincompact = 10;
     54    my $maxcompact = 30;
     55    my $doclevel = "top";
     56    my $onlyfirst = 0;
     57    my $recopt   = undef;
    5058
    5159    foreach $option (@options) {
     
    5664    } elsif ($option =~ /^mingroup(size)?=(.*)$/i) {
    5765        $mingroup = $2;
     66    } elsif ($option =~ /^minnesting=(.*)$/i) {
     67        $minnesting = $1;
     68    } elsif ($option =~ /^mincompact=(.*)$/i) {
     69        $mincompact = $1;
     70    } elsif ($option =~ /^maxcompact=(.*)$/i) {
     71        $maxcompact = $1;
     72    } elsif ($option =~ /^doclevel=(.*)$/i) {
     73        $doclevel = $1;
     74    } elsif ($option =~ /^onlyfirst$/i) {
     75        $onlyfirst = 1;
     76    } elsif ($option =~ /^recopt$/i) {
     77        $recopt = "on";
    5878    }
    5979    }
     
    7090    'metaname' => $metaname,
    7191    'title' => $title,
    72     'mingroup' => $mingroup
     92    'mingroup' => $mingroup,
     93    'minnesting' => $minnesting,
     94    'mincompact' => $mincompact,
     95    'maxcompact' => $maxcompact,
     96    'doclevel' => $doclevel,
     97    'onlyfirst' => $onlyfirst,
     98    'recopt' => $recopt
    7399    }, $class;
     100
     101
    74102}
    75103
     
    93121    my $doc_OID = $doc_obj->get_OID();
    94122
    95     my $thissection = $doc_obj->get_top_section();
     123    my @sectionlist = ();
     124    my $topsection = $doc_obj->get_top_section();
     125
    96126    my $metaname = $self->{'metaname'};
    97127
    98     my $metavalue = $doc_obj->get_metadata_element($thissection,$metaname);
    99     my $date = $doc_obj->get_metadata_element($thissection,"Date");
    100 
    101     # if this document doesn't contain the metadata element we're
    102     # sorting by we won't include it in this classification
    103     if (defined $metavalue && $metavalue =~ /\w/)
    104     {
    105     my $formatted_metavalue = $metavalue;
    106 
    107     if ($self->{'metaname'} eq 'Creator')
    108     {
    109         &sorttools::format_string_name_english (\$formatted_metavalue);
     128    $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list
     129##    print STDERR "AZCompactList: processing $doc_OID for $metaname\n";
     130
     131    if ($self->{'doclevel'} =~ /^top(level)?/i)
     132    {
     133    push(@sectionlist,$topsection);
     134    }
     135    else
     136    {
     137    my $thissection = $doc_obj->get_next_section($topsection);
     138    while (defined $thissection)
     139    {
     140        push(@sectionlist,$thissection);
     141        $thissection = $doc_obj->get_next_section ($thissection);
     142    }
     143    }
     144
     145    my $thissection;
     146    foreach $thissection (@sectionlist)
     147    {
     148    my $full_doc_OID
     149        = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID;
     150
     151    if (defined $self->{'list'}->{$full_doc_OID})
     152    {
     153        print STDERR "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n";
    110154    }
    111     else
    112     {
    113         &sorttools::format_string_english (\$formatted_metavalue);
    114     }
    115     if (defined $self->{'list'}->{$doc_OID})
    116     {
    117         print STDERR "WARNING: AZCompactList::classify called multiple times for $doc_OID\n";
    118     }
    119 
    120     $self->{'list'}->{$doc_OID} = $formatted_metavalue;
    121     $self->{'listmetavalue'}->{$doc_OID} = $metavalue;
    122     $self->{'reclassify'}->{$doc_OID} = [$doc_obj,$date]
     155    $self->{'list'}->{$full_doc_OID} = []; 
     156    $self->{'listmetavalue'}->{$full_doc_OID} = [];
     157
     158    my $metavalues = $doc_obj->get_metadata($thissection,$metaname);
     159    my $metavalue;
     160    foreach $metavalue (@$metavalues)
     161    {
     162###     print STDERR "$metaname :\tmetavalue = $metavalue\n";
     163        # if this document doesn't contain the metadata element we're
     164        # sorting by we won't include it in this classification
     165        if (defined $metavalue && $metavalue =~ /\w/)
     166        {
     167        my $formatted_metavalue = $metavalue;
     168        if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
     169        {
     170            &sorttools::format_string_name_english (\$formatted_metavalue);
     171        }
     172        else
     173        {
     174            &sorttools::format_string_english (\$formatted_metavalue);
     175        }
     176       
     177        push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue);
     178        push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue);
     179
     180        last if ($self->{'onlyfirst'});
     181        }
     182    }
     183    my $date = $doc_obj->get_metadata_element($thissection,"Date");
     184    $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date];
    123185    }
    124186}
     
    133195
    134196    # find out how often each metavalue occurs
    135     map { $mtfreq{$self->{'listmetavalue'}->{$_}}++; } @$classlist_ref;
     197    map
     198    {
     199    my $mv;
     200    foreach $mv (@{$self->{'listmetavalue'}->{$_}} )
     201    {
     202        $mtfreq{$mv}++;
     203    }
     204    } @$classlist_ref;
    136205
    137206    # use this information to split the list: single metavalue/repeated value
    138207    map
    139208    {
    140     my $metavalue = $self->{'listmetavalue'}->{$_};
    141     print "meta value = $metavalue; count = $mtfreq{$metavalue}\n";
    142 
    143     if ($mtfreq{$metavalue}>=$self->{'mingroup'})
    144     {
    145         push(@multiple_classlist,$_);
    146     }
    147     else
    148     {
    149         push(@single_classlist,$_);
    150         $self->{'reclassifylist'}->{$_} = $metavalue;
     209    my $i = 1;
     210    my $metavalue;
     211    foreach $metavalue (@{$self->{'listmetavalue'}->{$_}})
     212    {
     213        if ($mtfreq{$metavalue} >= $self->{'mingroup'})
     214        {
     215        push(@multiple_classlist,[$_,$i,$metavalue]);
     216        }
     217        else
     218        {
     219        push(@single_classlist,[$_,$metavalue]);
     220        $metavalue =~ tr/[A-Z]/[a-z]/;
     221        $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue;
     222        }
     223        $i++;
    151224    }
    152225    } @$classlist_ref;
     
    157230    $self->{'classifiers'} = {};
    158231
    159     my $listname
    160     = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/List.pm");
    161     if (-e $listname) { require $listname; }
    162     else
    163     {
    164     die "TCCList ERROR - couldn't find classifier \"$listname\"\n";
    165     }
    166 
     232    my $pm;
     233    foreach $pm ("List", "SectionList")
     234    {
     235    my $listname
     236        = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm");
     237    if (-e $listname) { require $listname; }
     238    else
     239    {
     240        die "AZCompactList ERROR - couldn't find classifier \"$listname\"\n";
     241    }
     242    }
     243
     244    # Create classifiers objects for each entry >= mingroup
    167245    my $metavalue;
    168246    foreach $metavalue (keys %mtfreq)
    169247    {
    170     if ($mtfreq{$metavalue}>=$self->{'mingroup'})
     248    if ($mtfreq{$metavalue} >= $self->{'mingroup'})
    171249    {
    172250        my $listclassobj;
     251        my $doclevel = $self->{'doclevel'};
    173252        my $metaname  = $self->{'metaname'};
    174        
    175         eval ("\$listclassobj = new List(\"metadata=$metaname\", \"title=\$metavalue\", \"sort=Date\")");
     253        my @metaname_list = split('/',$metaname);
     254        $metaname = shift(@metaname_list);
     255        if (@metaname_list==0)
     256        {
     257        if ($doclevel =~ m/^top(level)?/i)
     258        {
     259            my $args = "\"metadata=$metaname\"";
     260            $args .= ", \"title=\$metavalue\"";
     261            $args .= ", \"sort=Date\"";
     262            eval ("\$listclassobj = new List($args)");
     263        }
     264        else
     265        {
     266            my $args = "\"metadata=$metaname\"";
     267            $args .= ", \"title=\$metavalue\"";
     268            $args .= ", \"sort=Date\"";
     269            eval ("\$listclassobj = new SectionList($args)");
     270        }
     271        }
     272        else
     273        {
     274        $metaname = join('/',@metaname_list);
     275
     276        my $args = "\"metadata=$metaname\"";
     277        $args .= ", \"title=\$metavalue\"";
     278        $args .= ", \"doclevel=\$doclevel\"";
     279        $args .= ", \"recopt\"";
     280
     281        eval ("\$listclassobj = new AZCompactList($args)");
     282        }
    176283        die "$@" if $@;
    177284       
     
    181288        {
    182289        my $formatted_node = $metavalue;
    183         if ($self->{'metaname'} eq 'Creator')
     290        if ($self->{'metaname'} =~ m/^Creator(:.*)?$/)
    184291        {
    185292            &sorttools::format_string_name_english(\$formatted_node);
     
    206313    my ($self,$multiple_cl_ref) = @_;
    207314
    208     my $metaname  = $self->{'metaname'};
    209 
    210     my $doc_OID;
    211     foreach $doc_OID (@$multiple_cl_ref)
    212     {
     315    # Entries in the current classify list that are "book nodes"
     316    # should be recursively classified.
     317    #--
     318    foreach $dm_pair (@$multiple_cl_ref)
     319    {
     320    my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair;
    213321        my $listclassobj;
    214         my $metavalue = $self->{'listmetavalue'}->{$doc_OID};
    215    
     322
    216323    # find metavalue in list of sub-classifiers
    217324    my $found = 0;
     
    219326    foreach $node_name (keys %{$self->{'classifiers'}})
    220327    {
    221         if ($metavalue =~ /^$node_name$/i)
     328        $resafe_node_name = $node_name;
     329        $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g;
     330        if ($metavalue =~ m/^$resafe_node_name$/i)
    222331        {
    223332        my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}};
    224333
    225         $self->{'classifiers'}->{$node_name}->{'classifyobj'}
    226               ->classify($doc_obj, $date);
    227 
     334        ## date appears to not be used in classifier call ####
     335
     336        if ($doc_OID =~ m/^.*\.(\d+)$/)
     337        {
     338            $self->{'classifiers'}->{$node_name}->{'classifyobj'}
     339            ->classify($doc_obj, "Section=$1");
     340        }
     341        else
     342        {
     343            $self->{'classifiers'}->{$node_name}->{'classifyobj'}
     344            ->classify($doc_obj);
     345        }
     346       
    228347        $found = 1;
    229348        last;
    230349        }
    231350    }
    232 
     351   
    233352    if (!$found)
    234353    {
    235354        print STDERR "Warning: AZCompactList::reclassify ";
    236         print STDERR "could not find sub-node for $metavalue\n";
     355        print STDERR "could not find sub-node for $metavalue with doc_OID $doc_OID\n";
    237356    }
    238357    }
     
    250369        my $classifyinfo
    251370        = $self->{'classifiers'}->{$node_name}->{'classifyobj'}
    252             ->get_classify_info(1);
     371            ->get_classify_info();
    253372        $self->{'classifiers'}->{$node_name}->{'classifyinfo'}
    254373        = $classifyinfo;
     
    259378
    260379
     380sub alpha_numeric_cmp
     381{
     382    my ($self,$a,$b) = @_;
     383
     384    my $title_a = $self->{'reclassifylist'}->{$a};
     385    my $title_b = $self->{'reclassifylist'}->{$b};
     386
     387    if ($title_a =~ m/^(\d+(\.\d+)?)/)
     388    {
     389    my $val_a = $1;
     390    if ($title_b =~ m/^(\d+(\.\d+)?)/)
     391    {
     392        my $val_b = $1;
     393        if ($val_a != $val_b)
     394        {
     395        return ($val_a <=> $val_b);
     396        }
     397    }
     398    }
     399   
     400    return ($title_a cmp $title_b);
     401}
     402
    261403sub get_classify_info {
    262404    my $self = shift (@_);
    263405
    264     my @classlist = sort {$self->{'list'}->{$a} cmp $self->{'list'}->{$b};} keys %{$self->{'list'}};
     406    my @classlist =keys %{$self->{'list'}}; # list all doc oids
    265407
    266408    my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist);
     
    268410    $self->get_reclassify_info();
    269411
     412
     413#    my @reclassified_classlist
     414#   = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
     415
     416    # alpha_numeric_cmp is slower but handles numbers better ...
    270417    my @reclassified_classlist
    271     = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}};
     418    = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}};
     419
    272420
    273421    return $self->splitlist (\@reclassified_classlist);
    274422}
    275423
    276 # splitlist takes an ordered list of classifications (@$classlistref) and splits it
    277 # up into alphabetical sub-sections.
     424sub get_entry {
     425    my $self = shift (@_);
     426    my ($title, $childtype, $metaname, $thistype) = @_;
     427
     428    # organise into classification structure
     429    my %classifyinfo = ('childtype'=>$childtype,
     430                        'Title'=>$title,
     431                        'contains'=>[],
     432            'mdtype'=>$metaname);
     433
     434    $classifyinfo{'thistype'} = $thistype
     435        if defined $thistype && $thistype =~ /\w/;
     436
     437    return \%classifyinfo;
     438}
     439
     440
     441
     442# splitlist takes an ordered list of classifications (@$classlistref) and
     443# splits it up into alphabetical sub-sections.
    278444sub splitlist {
    279445    my $self = shift (@_);
     
    282448
    283449    # top level
     450    my @metanames = split("/",$self->{'metaname'});
     451    my $metaname = shift(@metanames);
     452
    284453    my $childtype = "HList";
    285     if (scalar (@$classlistref) <= 39) {$childtype = "VList";}
    286     my $classifyinfo = $self->get_entry ($self->{'title'}, $childtype, "Invisible");
    287 
    288     # don't need to do any splitting if there are less than 39 (max + min-1) classifications
    289     if ((scalar @$classlistref) <= 39) {
     454    $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'});
     455
     456    my $classifyinfo;
     457    if (!defined($self->{'recopt'}))
     458    {
     459    my $title = $metaname;
     460    $classifyinfo
     461        = $self->get_entry ($metaname, $childtype, $metaname, "Invisible");
     462    }
     463    else
     464    {
     465    my $title = $self->{'title'};
     466    $classifyinfo
     467        = $self->get_entry ($title, $childtype, $metaname, "VList");
     468    }
     469
     470    # don't need to do any splitting if there are less than 'minnesting' classifications
     471    if ((scalar @$classlistref) <= $self->{'minnesting'}) {
    290472    foreach $subOID (@$classlistref) {
    291473            if ($subOID =~ /^CLASSIFY\.(.*)$/
    292474        && defined $self->{'classifiers'}->{$1})
    293475        {
     476###     print STDERR "*** subOID = $subOID\n";
     477
    294478                push (@{$classifyinfo->{'contains'}},
    295479              $self->{'classifiers'}->{$1}->{'classifyinfo'});
     
    297481        else
    298482        {
    299         push (@{$classifyinfo->{'contains'}}, {'OID'=>$subOID});
     483        $subOID =~ s/^Metavalue_(\d+)\.//;
     484        my $metaname_offset = $1 -1;
     485        my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset};
     486        push (@{$classifyinfo->{'contains'}}, $oid_rec);
    300487        }
    301488    }
     
    307494    my $title = $self->{'reclassifylist'}->{$classification};
    308495    $title =~ s/&(.){2,4};//g; # remove any HTML special chars
    309     $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
     496### $title =~ s/^\s+//g; # remove a leading spaces
     497### $title =~ s/^_+//g; # remove a leading underscores
     498    $title =~ s/^\W+//g;
     499### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation
    310500    $title =~ s/^(.).*$/$1/;
    311501    $title =~ tr/[a-z]/[A-Z]/;
     
    334524    foreach $subclass (@tmparr)
    335525    {
    336     my $tempclassify = $self->get_entry ($subclass, "VList");
     526    my $tempclassify = $self->get_entry($subclass, "VList", $metaname);
    337527    foreach $subsubOID (@{$classhash->{$subclass}})
    338528    {
     
    345535        else
    346536        {
    347         push (@{$tempclassify->{'contains'}}, {'OID'=>$subsubOID});
     537        $subsubOID =~ s/^Metavalue_(\d+)\.//;
     538        my $metaname_offset = $1 -1;
     539        my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset};
     540        push (@{$tempclassify->{'contains'}}, $oid_rec);
    348541        }
    349542    }
     
    354547}
    355548
     549sub compactlist {
     550    my $self = shift (@_);
     551    my ($classhashref) = @_;
     552    my $compactedhash = {};
     553    my @currentOIDs = ();
     554    my $currentfirstletter = "";
     555    my $currentlastletter = "";
     556    my $lastkey = "";
     557
     558    # minimum and maximum documents to be displayed per page.
     559    # the actual maximum will be max + (min-1).
     560    # the smallest sub-section is a single letter at present
     561    # so in this case there may be many times max documents
     562    # displayed on a page.
     563    my $min = $self->{'mincompact'};
     564    my $max = $self->{'maxcompact'};
     565
     566    foreach $subsection (sort keys %$classhashref) {
     567    if ($subsection eq '0-9') {
     568        @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}};
     569        next;
     570    }
     571    $currentfirstletter = $subsection if $currentfirstletter eq "";
     572    if ((scalar (@currentOIDs) < $min) ||
     573        ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
     574        push (@currentOIDs, @{$classhashref->{$subsection}});
     575        $currentlastletter = $subsection;
     576    } else {
     577
     578        if ($currentfirstletter eq $currentlastletter) {
     579        @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
     580        $lastkey = $currentfirstletter;
     581        } else {
     582        @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
     583        $lastkey = "$currentfirstletter-$currentlastletter";
     584        }
     585        if (scalar (@{$classhashref->{$subsection}}) >= $max) {
     586        $compactedhash->{$subsection} = $classhashref->{$subsection};
     587        @currentOIDs = ();
     588        $currentfirstletter = "";
     589        } else {
     590        @currentOIDs = @{$classhashref->{$subsection}};
     591        $currentfirstletter = $subsection;
     592        $currentlastletter = $subsection;
     593        }
     594    }
     595    }
     596
     597    # add final OIDs to last sub-classification if there aren't many otherwise
     598    # add final sub-classification
     599    if (scalar (@currentOIDs) < $min) {
     600    my ($newkey) = $lastkey =~ /^(.)/;
     601    @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
     602    delete $compactedhash->{$lastkey};
     603    @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs;
     604    } else {
     605    if ($currentfirstletter eq $currentlastletter) {
     606        @{$compactedhash->{$currentfirstletter}} = @currentOIDs;
     607    } else {
     608        @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs;
     609    }
     610    }
     611
     612    return $compactedhash;
     613}
     614
    3566151;
     616
     617
Note: See TracChangeset for help on using the changeset viewer.