########################################################################### # # AZCompactList.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # classifier plugin for sorting alphabetically # options are: # # metadata=Metaname -- all documents with Metaname metadata # will be included in list, list will be sorted # by this element. # buttonname=Title -- (optional) the title field for this classification. # if not included title field will be Metaname. # mingroup=Num -- (optional) the smallest value that will cause # a group in the hierarchy to form. # minnesting=Num -- (optional) the smallest value that will cause a # list to converted into nested list # mincompact=Num -- (optional) used in compact list # maxcompact=Num -- (optional) used in compact list # doclevel=top|section -- (optional) level to process document at. # onlyfirst -- (optional) control whether all or only first # metadata value used from array of metadata package AZCompactList; use BasClas; use sorttools; sub BEGIN { @ISA = ('BasClas'); } my $doclevel_list = [ { 'name' => "top", 'desc' => "Whole document." } , { 'name' => "section", 'desc' => "By sections." } ]; my $arguments = [ { 'name' => "metadata", 'desc' => "Metadata field used for classification. List will be sorted by this element.", 'type' => "metadata", 'reqd' => "yes" } , { 'name' => "buttonname", 'desc' => "Button name for this classification. Defaults to metadata name.", 'type' => "string", 'reqd' => "no" } , { 'name' => "mingroup", 'desc' => "The smallest value that will cause a group in the hierarchy to form.", 'type' => "int", 'reqd' => "no" } , { 'name' => "minnesting", 'desc' => "The smallest value that will cause a list to converted into nested list.", 'type' => "int", 'reqd' => "no" } , { 'name' => "mincompact", 'desc' => "Used in compact list.", 'type' => "int", 'reqd' => "no" } , { 'name' => "maxcompact", 'desc' => "Used in compact list.", 'type' => "int", 'reqd' => "no" } , { 'name' => "doclevel", 'desc' => "Level to process document at.", 'type' => "enum", 'list' => $doclevel_list, 'reqd' => "no" } , { 'name' => "onlyfirst", 'desc' => "Control whether all or only first metadata value used from array of metadata.", 'type' => "flag", 'reqd' => "no" } ]; my $options = { 'name' => "AZCompactList", 'desc' => "Classifier plugin for sorting alphabetically", 'inherits' => "Yes", 'args' => $arguments }; sub print_usage { print STDERR " usage: classify AZCompactList -metadata X [options] options: -metadata X (required) Metadata field used for classification -buttonname X Title to use on web pages (defaults to metadata) -removeprefix regex pattern to remove from metadata before sorting -doclevel top|section (Defaults to top) -freqsort Sort by node frequency rather than alpha-numeric -mingroup N Minimum num of documents required to form a new group -minnesting N Minimum list size to become a nested list -mincompact N Used in compact list -maxcompact N Used in compact list -onlyfirst Only use the first value if metadata is repeated. -recopt "; } sub new { my $class = shift (@_); my $self = new BasClas($class, @_); # 14-05-02 To allow for proper inheritance of arguments - John Thompson my $option_list = $self->{'option_list'}; push( @{$option_list}, $options ); my ($metaname, $title, $removeprefix); my $mingroup = 2; my $minnesting = 20; my $mincompact = 10; my $maxcompact = 30; my $doclevel = "top"; my $onlyfirst = 0; my $freqsort = 0; my $recopt = undef; if (!parsargv::parse(\@_, q^metadata/.*/^, \$metaname, q^buttonname/.*/^, \$title, q^removeprefix/.*/^, \$removeprefix, q^mingroup/.*/2^, \$mingroup, q^minnesting/.*/20^, \$minnesting, q^mincompact/.*/10^, \$mincompact, q^maxcompact/.*/30^, \$maxcompact, q^doclevel/.*/top^, \$doclevel, q^onlyfirst/.*/0^, \$onlyfirst, q^freqsort/.*/0^, \$freqsort, q^recopt/.*/-1^, \$recopt, "allow_extra_options")) { print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n"; &print_usage(); die "\n"; } if (!defined $metaname) { my $outhandle = $self->{'outhandle'}; print $outhandle "AZCompactList used with no metadata name to classify by\n"; die "\n"; } $title = $metaname unless ($title); $self->{'list'} = {}; $self->{'listmetavalue'} = {}; $self->{'reclassify'} = {}; $self->{'reclassifylist'} = {}; $self->{'metaname'} = $metaname; $self->{'title'} = "$title"; # title for the titlebar. if (defined($removeprefix) && $removeprefix) { $self->{'removeprefix'} = $removeprefix; } $self->{'mingroup'} = $mingroup; $self->{'minnesting'} = $minnesting; $self->{'mincompact'} = $mincompact; $self->{'maxcompact'} = $maxcompact; $self->{'doclevel'} = $doclevel; if ($onlyfirst != 0) { $onlyfirst = 1; } $self->{'onlyfirst'} = $onlyfirst; if ($freqsort != 0) { $freqsort = 1; } $self->{'freqsort'} = $freqsort; if ($recopt == -1) { $recopt = undef; } else { $recopt = "on"; } $self->{'recopt'} = $recopt; return bless $self, $class; } sub init { my $self = shift (@_); $self->{'list'} = {}; $self->{'listmetavalue'} = {}; $self->{'reclassify'} = {}; $self->{'reclassifylist'} = {}; } $tmp = 0; sub classify { my $self = shift (@_); my ($doc_obj) = @_; my $doc_OID = $doc_obj->get_OID(); my @sectionlist = (); my $topsection = $doc_obj->get_top_section(); my $metaname = $self->{'metaname'}; my $outhandle = $self->{'outhandle'}; $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list if ($self->{'doclevel'} =~ /^top(level)?/i) { push(@sectionlist,$topsection); } else { my $thissection = $doc_obj->get_next_section($topsection); while (defined $thissection) { push(@sectionlist,$thissection); $thissection = $doc_obj->get_next_section ($thissection); } } my $thissection; foreach $thissection (@sectionlist) { my $full_doc_OID = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID; if (defined $self->{'list'}->{$full_doc_OID}) { print $outhandle "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n"; } $self->{'list'}->{$full_doc_OID} = []; $self->{'listmetavalue'}->{$full_doc_OID} = []; my $metavalues = $doc_obj->get_metadata($thissection,$metaname); my $metavalue; foreach $metavalue (@$metavalues) { # if this document doesn't contain the metadata element we're # sorting by we won't include it in this classification if (defined $metavalue && $metavalue =~ /\w/) { if ($self->{'removeprefix'}) { $metavalue =~ s/^$self->{'removeprefix'}//; } my $formatted_metavalue = $metavalue; if ($self->{'metaname'} =~ m/^Creator(:.*)?$/) { &sorttools::format_string_name_english (\$formatted_metavalue); } else { &sorttools::format_string_english (\$formatted_metavalue); } #### prefix-str if (! defined($formatted_metavalue)) { print $outhandle "Warning: AZCompactList: metavalue is "; print $outhandle "empty\n"; $formatted_metavalue=""; } push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue); push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue); last if ($self->{'onlyfirst'}); } } my $date = $doc_obj->get_metadata_element($thissection,"Date"); $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date]; } } sub reinit { my ($self,$classlist_ref) = @_; my $outhandle = $self->{'outhandle'}; my %mtfreq = (); my @single_classlist = (); my @multiple_classlist = (); # find out how often each metavalue occurs map { my $mv; foreach $mv (@{$self->{'listmetavalue'}->{$_}} ) { $mtfreq{$mv}++; } } @$classlist_ref; # use this information to split the list: single metavalue/repeated value map { my $i = 1; my $metavalue; foreach $metavalue (@{$self->{'listmetavalue'}->{$_}}) { if ($mtfreq{$metavalue} >= $self->{'mingroup'}) { push(@multiple_classlist,[$_,$i,$metavalue]); } else { push(@single_classlist,[$_,$metavalue]); $metavalue =~ tr/[A-Z]/[a-z]/; $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue; } $i++; } } @$classlist_ref; # Setup sub-classifiers for multiple list $self->{'classifiers'} = {}; my $pm; foreach $pm ("List", "SectionList") { my $listname = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm"); if (-e $listname) { require $listname; } else { print $outhandle "AZCompactList ERROR - couldn't find classifier \"$listname\"\n"; die "\n"; } } # Create classifiers objects for each entry >= mingroup my $metavalue; foreach $metavalue (keys %mtfreq) { if ($mtfreq{$metavalue} >= $self->{'mingroup'}) { my $listclassobj; my $doclevel = $self->{'doclevel'}; my $metaname = $self->{'metaname'}; my @metaname_list = split('/',$metaname); $metaname = shift(@metaname_list); if (@metaname_list==0) { my @args; push @args, ("-metadata", "$metaname"); # buttonname is also used for the node's title push @args, ("-buttonname", "$metavalue"); push @args, ("-sort", "Date"); if ($doclevel =~ m/^top(level)?/i) { eval ("\$listclassobj = new List(\@args)"); warn $@ if $@; } else { eval ("\$listclassobj = new SectionList(\@args)"); } } else { $metaname = join('/',@metaname_list); my @args; push @args, ("-metadata", "$metaname"); # buttonname is also used for the node's title push @args, ("-buttonname", "$metavalue"); push @args, ("-doclevel", "$doclevel"); push @args, "-recopt"; eval ("\$listclassobj = new AZCompactList(\@args)"); } if ($@) { print $outhandle "$@"; die "\n"; } $listclassobj->init(); if (defined $metavalue && $metavalue =~ /\w/) { my $formatted_node = $metavalue; if ($self->{'removeprefix'}) { $formatted_node =~ s/^$self->{'removeprefix'}//; } if ($self->{'metaname'} =~ m/^Creator(:.*)?$/) { &sorttools::format_string_name_english(\$formatted_node); } else { &sorttools::format_string_english(\$formatted_node); } # In case our formatted string is empty... if (! defined($formatted_node)) { print $outhandle "Warning: AZCompactList: metavalue is "; print $outhandle "empty\n"; $formatted_node=""; } $self->{'classifiers'}->{$metavalue} = { 'classifyobj' => $listclassobj, 'formattednode' => $formatted_node }; } } } return (\@single_classlist,\@multiple_classlist); } sub reclassify { my ($self,$multiple_cl_ref) = @_; # Entries in the current classify list that are "book nodes" # should be recursively classified. #-- foreach $dm_pair (@$multiple_cl_ref) { my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair; my $listclassobj; # find metavalue in list of sub-classifiers my $found = 0; my $node_name; foreach $node_name (keys %{$self->{'classifiers'}}) { $resafe_node_name = $node_name; # escape chars that mean something to perl... $resafe_node_name =~ s/([\\\(\)\[\]\{\}\^\$\.\+\*\?\|])/\\$1/g; if ($metavalue =~ m/^$resafe_node_name$/i) { my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}}; ## date appears to not be used in classifier call #### if ($doc_OID =~ m/^[^\.]*\.([\d\.]+)$/) { my $section=$1; if ($self->{'doclevel'} =~ m/^top/i) { # toplevel $self->{'classifiers'}->{$node_name}->{'classifyobj'} ->classify($doc_obj, "Section=$section"); } else { # section level # Thanks to Don Gourley for this... # classify can't handle multi-level section $self->{'classifiers'}->{$node_name}->{'classifyobj'} ->classify_section($section, $doc_obj, $date); } } else { $self->{'classifiers'}->{$node_name}->{'classifyobj'} ->classify($doc_obj); } $found = 1; last; } } if (!$found) { my $outhandle=$self->{outhandle}; print $outhandle "Warning: AZCompactList::reclassify "; print $outhandle "could not find sub-node for metadata=`$metavalue' with doc_OID $doc_OID\n"; } } } sub get_reclassify_info { my $self = shift (@_); my $node_name; foreach $node_name (keys %{$self->{'classifiers'}}) { my $classifyinfo = $self->{'classifiers'}->{$node_name}->{'classifyobj'} ->get_classify_info(); $self->{'classifiers'}->{$node_name}->{'classifyinfo'} = $classifyinfo; $self->{'reclassifylist'}->{"CLASSIFY.$node_name"} = $self->{'classifiers'}->{$node_name}->{'formattednode'}; } } sub alpha_numeric_cmp { my ($self,$a,$b) = @_; my $title_a = $self->{'reclassifylist'}->{$a}; my $title_b = $self->{'reclassifylist'}->{$b}; if ($title_a =~ m/^(\d+(\.\d+)?)/) { my $val_a = $1; if ($title_b =~ m/^(\d+(\.\d+)?)/) { my $val_b = $1; if ($val_a != $val_b) { return ($val_a <=> $val_b); } } } return ($title_a cmp $title_b); } sub frequency_cmp { my ($self,$a,$b) = @_; my $title_a = $self->{'reclassifylist'}->{$a}; my $title_b = $self->{'reclassifylist'}->{$b}; my $a_freq = 1; my $b_freq = 1; if ($a =~ m/^CLASSIFY\.(.*)$/) { my $a_node = $1; my $a_nodeinfo = $self->{'classifiers'}->{$a_node}->{'classifyinfo'}; $a_freq = scalar(@{$a_nodeinfo->{'contains'}}); } if ($b =~ m/^CLASSIFY\.(.*)$/) { my $b_node = $1; my $b_nodeinfo = $self->{'classifiers'}->{$b_node}->{'classifyinfo'}; $b_freq = scalar(@{$b_nodeinfo->{'contains'}}); } return $b_freq <=> $a_freq; } sub get_classify_info { my $self = shift (@_); my @classlist =keys %{$self->{'list'}}; # list all doc oids my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist); $self->reclassify($multiple_cl_ref); $self->get_reclassify_info(); my @reclassified_classlist; if ($self->{'freqsort'}) { @reclassified_classlist = sort { $self->frequency_cmp($a,$b) } keys %{$self->{'reclassifylist'}}; # supress sub-grouping by alphabet map { $self->{'reclassifylist'}->{$_} = "A".$self->{'reclassifylist'}; } keys %{$self->{'reclassifylist'}}; } else { # @reclassified_classlist # = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}}; # alpha_numeric_cmp is slower than "cmp" but handles numbers better ... @reclassified_classlist = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}}; } return $self->splitlist (\@reclassified_classlist); } sub get_entry { my $self = shift (@_); my ($title, $childtype, $metaname, $thistype) = @_; # organise into classification structure my %classifyinfo = ('childtype'=>$childtype, 'Title'=>$title, 'contains'=>[], 'mdtype'=>$metaname); $classifyinfo{'thistype'} = $thistype if defined $thistype && $thistype =~ /\w/; return \%classifyinfo; } # splitlist takes an ordered list of classifications (@$classlistref) and # splits it up into alphabetical sub-sections. sub splitlist { my $self = shift (@_); my ($classlistref) = @_; my $classhash = {}; # top level my @metanames = split("/",$self->{'metaname'}); my $metaname = shift(@metanames); my $childtype = "HList"; $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'}); my $classifyinfo; if (!defined($self->{'recopt'})) { my $title = $self->{'title'}; # should always be defined by now.... $title = $metaname unless defined $title; $classifyinfo = $self->get_entry ($title, $childtype, $metaname, "Invisible"); } else { my $title = $self->{'title'}; $classifyinfo = $self->get_entry ($title, $childtype, $metaname, "VList"); } # don't need to do any splitting if there are less than 'minnesting' classifications if ((scalar @$classlistref) <= $self->{'minnesting'}) { foreach $subOID (@$classlistref) { if ($subOID =~ /^CLASSIFY\.(.*)$/ && defined $self->{'classifiers'}->{$1}) { push (@{$classifyinfo->{'contains'}}, $self->{'classifiers'}->{$1}->{'classifyinfo'}); } else { $subOID =~ s/^Metavalue_(\d+)\.//; my $metaname_offset = $1 -1; my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset}; push (@{$classifyinfo->{'contains'}}, $oid_rec); } } return $classifyinfo; } # first split up the list into separate A-Z and 0-9 classifications foreach $classification (@$classlistref) { my $title = $self->{'reclassifylist'}->{$classification}; $title =~ s/&(.){2,4};//g; # remove any HTML special chars ### $title =~ s/^\s+//g; # remove a leading spaces ### $title =~ s/^_+//g; # remove a leading underscores $title =~ s/^\W+//g; ### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation # only want first character for classification $title =~ m/^(.)/; $title=$1; $title =~ tr/[a-z]/[A-Z]/; if ($title =~ /^[0-9]$/) {$title = '0-9';} elsif ($title !~ /^[A-Z]$/) { my $outhandle = $self->{'outhandle'}; print $outhandle "AZCompactList: WARNING $classification has badly formatted title ($title)\n"; } $classhash->{$title} = [] unless defined $classhash->{$title}; push (@{$classhash->{$title}}, $classification); } $classhash = $self->compactlist ($classhash); my @tmparr = (); foreach $subsection (sort keys (%$classhash)) { push (@tmparr, $subsection); } # if there's a 0-9 section it will have been sorted to the beginning # but we want it at the end if ($tmparr[0] eq '0-9') { shift @tmparr; push (@tmparr, '0-9'); } foreach $subclass (@tmparr) { my $tempclassify = (scalar(@tmparr)==1) ? ($self->get_entry(" ", "VList", $metaname)) : ($self->get_entry($subclass, "VList", $metaname)); foreach $subsubOID (@{$classhash->{$subclass}}) { if ($subsubOID =~ /^CLASSIFY\.(.*)$/ && defined $self->{'classifiers'}->{$1}) { push (@{$tempclassify->{'contains'}}, $self->{'classifiers'}->{$1}->{'classifyinfo'}); } else { $subsubOID =~ s/^Metavalue_(\d+)\.//; my $metaname_offset = $1 -1; my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset}; push (@{$tempclassify->{'contains'}}, $oid_rec); } } push (@{$classifyinfo->{'contains'}}, $tempclassify); } return $classifyinfo; } sub compactlist { my $self = shift (@_); my ($classhashref) = @_; my $compactedhash = {}; my @currentOIDs = (); my $currentfirstletter = ""; my $currentlastletter = ""; my $lastkey = ""; # minimum and maximum documents to be displayed per page. # the actual maximum will be max + (min-1). # the smallest sub-section is a single letter at present # so in this case there may be many times max documents # displayed on a page. my $min = $self->{'mincompact'}; my $max = $self->{'maxcompact'}; foreach $subsection (sort keys %$classhashref) { if ($subsection eq '0-9') { @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}}; next; } $currentfirstletter = $subsection if $currentfirstletter eq ""; if ((scalar (@currentOIDs) < $min) || ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) { push (@currentOIDs, @{$classhashref->{$subsection}}); $currentlastletter = $subsection; } else { if ($currentfirstletter eq $currentlastletter) { @{$compactedhash->{$currentfirstletter}} = @currentOIDs; $lastkey = $currentfirstletter; } else { @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs; $lastkey = "$currentfirstletter-$currentlastletter"; } if (scalar (@{$classhashref->{$subsection}}) >= $max) { $compactedhash->{$subsection} = $classhashref->{$subsection}; @currentOIDs = (); $currentfirstletter = ""; } else { @currentOIDs = @{$classhashref->{$subsection}}; $currentfirstletter = $subsection; $currentlastletter = $subsection; } } } # add final OIDs to last sub-classification if there aren't many otherwise # add final sub-classification if (scalar (@currentOIDs) < $min) { my ($newkey) = $lastkey =~ /^(.)/; @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs); delete $compactedhash->{$lastkey}; @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs; } else { if ($currentfirstletter eq $currentlastletter) { @{$compactedhash->{$currentfirstletter}} = @currentOIDs; } else { @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs; } } return $compactedhash; } 1;