- Timestamp:
- 2000-08-01T13:56:13+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/AZCompactList.pm
r1250 r1313 26 26 # classifier plugin for sorting alphabetically 27 27 # options are: 28 # 28 29 # metadata=Metaname -- all documents with Metaname metadata 29 30 # will be included in list, list will be sorted … … 33 34 # mingroup=Num -- (optional) the smallest value that will cause 34 35 # a group in the hierarchy to form. 35 36 # minnesting=Num -- (optional) the smallest value that will cause a 37 # list to converted into nested list 38 # mincompact=Num -- (optional) used in compact list 39 # maxcompact=Num -- (optional) used in compact list 40 # doclevel=top|section -- (optional) level to process document at. 41 # onlyfirst -- (optional) control whether all or only first 42 # metadata value used from array of metadata 36 43 package AZCompactList; 37 44 38 use AZList;39 45 use sorttools; 40 41 sub BEGIN {42 @ISA = ('AZList');43 }44 46 45 47 sub new { … … 48 50 my ($metaname, $title); 49 51 my $mingroup = 2; 52 my $minnesting = 20; 53 my $mincompact = 10; 54 my $maxcompact = 30; 55 my $doclevel = "top"; 56 my $onlyfirst = 0; 57 my $recopt = undef; 50 58 51 59 foreach $option (@options) { … … 56 64 } elsif ($option =~ /^mingroup(size)?=(.*)$/i) { 57 65 $mingroup = $2; 66 } elsif ($option =~ /^minnesting=(.*)$/i) { 67 $minnesting = $1; 68 } elsif ($option =~ /^mincompact=(.*)$/i) { 69 $mincompact = $1; 70 } elsif ($option =~ /^maxcompact=(.*)$/i) { 71 $maxcompact = $1; 72 } elsif ($option =~ /^doclevel=(.*)$/i) { 73 $doclevel = $1; 74 } elsif ($option =~ /^onlyfirst$/i) { 75 $onlyfirst = 1; 76 } elsif ($option =~ /^recopt$/i) { 77 $recopt = "on"; 58 78 } 59 79 } … … 70 90 'metaname' => $metaname, 71 91 'title' => $title, 72 'mingroup' => $mingroup 92 'mingroup' => $mingroup, 93 'minnesting' => $minnesting, 94 'mincompact' => $mincompact, 95 'maxcompact' => $maxcompact, 96 'doclevel' => $doclevel, 97 'onlyfirst' => $onlyfirst, 98 'recopt' => $recopt 73 99 }, $class; 100 101 74 102 } 75 103 … … 93 121 my $doc_OID = $doc_obj->get_OID(); 94 122 95 my $thissection = $doc_obj->get_top_section(); 123 my @sectionlist = (); 124 my $topsection = $doc_obj->get_top_section(); 125 96 126 my $metaname = $self->{'metaname'}; 97 127 98 my $metavalue = $doc_obj->get_metadata_element($thissection,$metaname); 99 my $date = $doc_obj->get_metadata_element($thissection,"Date"); 100 101 # if this document doesn't contain the metadata element we're 102 # sorting by we won't include it in this classification 103 if (defined $metavalue && $metavalue =~ /\w/) 104 { 105 my $formatted_metavalue = $metavalue; 106 107 if ($self->{'metaname'} eq 'Creator') 108 { 109 &sorttools::format_string_name_english (\$formatted_metavalue); 128 $metaname =~ s/(\/.*)//; # grab first name in n1/n2/n3 list 129 ## print STDERR "AZCompactList: processing $doc_OID for $metaname\n"; 130 131 if ($self->{'doclevel'} =~ /^top(level)?/i) 132 { 133 push(@sectionlist,$topsection); 134 } 135 else 136 { 137 my $thissection = $doc_obj->get_next_section($topsection); 138 while (defined $thissection) 139 { 140 push(@sectionlist,$thissection); 141 $thissection = $doc_obj->get_next_section ($thissection); 142 } 143 } 144 145 my $thissection; 146 foreach $thissection (@sectionlist) 147 { 148 my $full_doc_OID 149 = ($thissection ne "") ? "$doc_OID.$thissection" : $doc_OID; 150 151 if (defined $self->{'list'}->{$full_doc_OID}) 152 { 153 print STDERR "WARNING: AZCompactList::classify called multiple times for $full_doc_OID\n"; 110 154 } 111 else 112 { 113 &sorttools::format_string_english (\$formatted_metavalue); 114 } 115 if (defined $self->{'list'}->{$doc_OID}) 116 { 117 print STDERR "WARNING: AZCompactList::classify called multiple times for $doc_OID\n"; 118 } 119 120 $self->{'list'}->{$doc_OID} = $formatted_metavalue; 121 $self->{'listmetavalue'}->{$doc_OID} = $metavalue; 122 $self->{'reclassify'}->{$doc_OID} = [$doc_obj,$date] 155 $self->{'list'}->{$full_doc_OID} = []; 156 $self->{'listmetavalue'}->{$full_doc_OID} = []; 157 158 my $metavalues = $doc_obj->get_metadata($thissection,$metaname); 159 my $metavalue; 160 foreach $metavalue (@$metavalues) 161 { 162 ### print STDERR "$metaname :\tmetavalue = $metavalue\n"; 163 # if this document doesn't contain the metadata element we're 164 # sorting by we won't include it in this classification 165 if (defined $metavalue && $metavalue =~ /\w/) 166 { 167 my $formatted_metavalue = $metavalue; 168 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/) 169 { 170 &sorttools::format_string_name_english (\$formatted_metavalue); 171 } 172 else 173 { 174 &sorttools::format_string_english (\$formatted_metavalue); 175 } 176 177 push(@{$self->{'list'}->{$full_doc_OID}},$formatted_metavalue); 178 push(@{$self->{'listmetavalue'}->{$full_doc_OID}} ,$metavalue); 179 180 last if ($self->{'onlyfirst'}); 181 } 182 } 183 my $date = $doc_obj->get_metadata_element($thissection,"Date"); 184 $self->{'reclassify'}->{$full_doc_OID} = [$doc_obj,$date]; 123 185 } 124 186 } … … 133 195 134 196 # find out how often each metavalue occurs 135 map { $mtfreq{$self->{'listmetavalue'}->{$_}}++; } @$classlist_ref; 197 map 198 { 199 my $mv; 200 foreach $mv (@{$self->{'listmetavalue'}->{$_}} ) 201 { 202 $mtfreq{$mv}++; 203 } 204 } @$classlist_ref; 136 205 137 206 # use this information to split the list: single metavalue/repeated value 138 207 map 139 208 { 140 my $metavalue = $self->{'listmetavalue'}->{$_}; 141 print "meta value = $metavalue; count = $mtfreq{$metavalue}\n"; 142 143 if ($mtfreq{$metavalue}>=$self->{'mingroup'}) 144 { 145 push(@multiple_classlist,$_); 146 } 147 else 148 { 149 push(@single_classlist,$_); 150 $self->{'reclassifylist'}->{$_} = $metavalue; 209 my $i = 1; 210 my $metavalue; 211 foreach $metavalue (@{$self->{'listmetavalue'}->{$_}}) 212 { 213 if ($mtfreq{$metavalue} >= $self->{'mingroup'}) 214 { 215 push(@multiple_classlist,[$_,$i,$metavalue]); 216 } 217 else 218 { 219 push(@single_classlist,[$_,$metavalue]); 220 $metavalue =~ tr/[A-Z]/[a-z]/; 221 $self->{'reclassifylist'}->{"Metavalue_$i.$_"} = $metavalue; 222 } 223 $i++; 151 224 } 152 225 } @$classlist_ref; … … 157 230 $self->{'classifiers'} = {}; 158 231 159 my $listname 160 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/List.pm"); 161 if (-e $listname) { require $listname; } 162 else 163 { 164 die "TCCList ERROR - couldn't find classifier \"$listname\"\n"; 165 } 166 232 my $pm; 233 foreach $pm ("List", "SectionList") 234 { 235 my $listname 236 = &util::filename_cat($ENV{'GSDLHOME'},"perllib/classify/$pm.pm"); 237 if (-e $listname) { require $listname; } 238 else 239 { 240 die "AZCompactList ERROR - couldn't find classifier \"$listname\"\n"; 241 } 242 } 243 244 # Create classifiers objects for each entry >= mingroup 167 245 my $metavalue; 168 246 foreach $metavalue (keys %mtfreq) 169 247 { 170 if ($mtfreq{$metavalue} >=$self->{'mingroup'})248 if ($mtfreq{$metavalue} >= $self->{'mingroup'}) 171 249 { 172 250 my $listclassobj; 251 my $doclevel = $self->{'doclevel'}; 173 252 my $metaname = $self->{'metaname'}; 174 175 eval ("\$listclassobj = new List(\"metadata=$metaname\", \"title=\$metavalue\", \"sort=Date\")"); 253 my @metaname_list = split('/',$metaname); 254 $metaname = shift(@metaname_list); 255 if (@metaname_list==0) 256 { 257 if ($doclevel =~ m/^top(level)?/i) 258 { 259 my $args = "\"metadata=$metaname\""; 260 $args .= ", \"title=\$metavalue\""; 261 $args .= ", \"sort=Date\""; 262 eval ("\$listclassobj = new List($args)"); 263 } 264 else 265 { 266 my $args = "\"metadata=$metaname\""; 267 $args .= ", \"title=\$metavalue\""; 268 $args .= ", \"sort=Date\""; 269 eval ("\$listclassobj = new SectionList($args)"); 270 } 271 } 272 else 273 { 274 $metaname = join('/',@metaname_list); 275 276 my $args = "\"metadata=$metaname\""; 277 $args .= ", \"title=\$metavalue\""; 278 $args .= ", \"doclevel=\$doclevel\""; 279 $args .= ", \"recopt\""; 280 281 eval ("\$listclassobj = new AZCompactList($args)"); 282 } 176 283 die "$@" if $@; 177 284 … … 181 288 { 182 289 my $formatted_node = $metavalue; 183 if ($self->{'metaname'} eq 'Creator')290 if ($self->{'metaname'} =~ m/^Creator(:.*)?$/) 184 291 { 185 292 &sorttools::format_string_name_english(\$formatted_node); … … 206 313 my ($self,$multiple_cl_ref) = @_; 207 314 208 my $metaname = $self->{'metaname'}; 209 210 my $doc_OID; 211 foreach $doc_OID (@$multiple_cl_ref) 212 { 315 # Entries in the current classify list that are "book nodes" 316 # should be recursively classified. 317 #-- 318 foreach $dm_pair (@$multiple_cl_ref) 319 { 320 my ($doc_OID,$mdoffset,$metavalue) = @$dm_pair; 213 321 my $listclassobj; 214 my $metavalue = $self->{'listmetavalue'}->{$doc_OID}; 215 322 216 323 # find metavalue in list of sub-classifiers 217 324 my $found = 0; … … 219 326 foreach $node_name (keys %{$self->{'classifiers'}}) 220 327 { 221 if ($metavalue =~ /^$node_name$/i) 328 $resafe_node_name = $node_name; 329 $resafe_node_name =~ s/(\(|\)|\[|\]|\{|\}|\^|\$|\.|\+|\*|\?|\|)/\\$1/g; 330 if ($metavalue =~ m/^$resafe_node_name$/i) 222 331 { 223 332 my ($doc_obj,$date) = @{$self->{'reclassify'}->{$doc_OID}}; 224 333 225 $self->{'classifiers'}->{$node_name}->{'classifyobj'} 226 ->classify($doc_obj, $date); 227 334 ## date appears to not be used in classifier call #### 335 336 if ($doc_OID =~ m/^.*\.(\d+)$/) 337 { 338 $self->{'classifiers'}->{$node_name}->{'classifyobj'} 339 ->classify($doc_obj, "Section=$1"); 340 } 341 else 342 { 343 $self->{'classifiers'}->{$node_name}->{'classifyobj'} 344 ->classify($doc_obj); 345 } 346 228 347 $found = 1; 229 348 last; 230 349 } 231 350 } 232 351 233 352 if (!$found) 234 353 { 235 354 print STDERR "Warning: AZCompactList::reclassify "; 236 print STDERR "could not find sub-node for $metavalue \n";355 print STDERR "could not find sub-node for $metavalue with doc_OID $doc_OID\n"; 237 356 } 238 357 } … … 250 369 my $classifyinfo 251 370 = $self->{'classifiers'}->{$node_name}->{'classifyobj'} 252 ->get_classify_info( 1);371 ->get_classify_info(); 253 372 $self->{'classifiers'}->{$node_name}->{'classifyinfo'} 254 373 = $classifyinfo; … … 259 378 260 379 380 sub alpha_numeric_cmp 381 { 382 my ($self,$a,$b) = @_; 383 384 my $title_a = $self->{'reclassifylist'}->{$a}; 385 my $title_b = $self->{'reclassifylist'}->{$b}; 386 387 if ($title_a =~ m/^(\d+(\.\d+)?)/) 388 { 389 my $val_a = $1; 390 if ($title_b =~ m/^(\d+(\.\d+)?)/) 391 { 392 my $val_b = $1; 393 if ($val_a != $val_b) 394 { 395 return ($val_a <=> $val_b); 396 } 397 } 398 } 399 400 return ($title_a cmp $title_b); 401 } 402 261 403 sub get_classify_info { 262 404 my $self = shift (@_); 263 405 264 my @classlist = sort {$self->{'list'}->{$a} cmp $self->{'list'}->{$b};} keys %{$self->{'list'}};406 my @classlist =keys %{$self->{'list'}}; # list all doc oids 265 407 266 408 my ($single_cl_ref,$multiple_cl_ref) = $self->reinit(\@classlist); … … 268 410 $self->get_reclassify_info(); 269 411 412 413 # my @reclassified_classlist 414 # = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}}; 415 416 # alpha_numeric_cmp is slower but handles numbers better ... 270 417 my @reclassified_classlist 271 = sort {$self->{'reclassifylist'}->{$a} cmp $self->{'reclassifylist'}->{$b};} keys %{$self->{'reclassifylist'}}; 418 = sort { $self->alpha_numeric_cmp($a,$b) } keys %{$self->{'reclassifylist'}}; 419 272 420 273 421 return $self->splitlist (\@reclassified_classlist); 274 422 } 275 423 276 # splitlist takes an ordered list of classifications (@$classlistref) and splits it 277 # up into alphabetical sub-sections. 424 sub get_entry { 425 my $self = shift (@_); 426 my ($title, $childtype, $metaname, $thistype) = @_; 427 428 # organise into classification structure 429 my %classifyinfo = ('childtype'=>$childtype, 430 'Title'=>$title, 431 'contains'=>[], 432 'mdtype'=>$metaname); 433 434 $classifyinfo{'thistype'} = $thistype 435 if defined $thistype && $thistype =~ /\w/; 436 437 return \%classifyinfo; 438 } 439 440 441 442 # splitlist takes an ordered list of classifications (@$classlistref) and 443 # splits it up into alphabetical sub-sections. 278 444 sub splitlist { 279 445 my $self = shift (@_); … … 282 448 283 449 # top level 450 my @metanames = split("/",$self->{'metaname'}); 451 my $metaname = shift(@metanames); 452 284 453 my $childtype = "HList"; 285 if (scalar (@$classlistref) <= 39) {$childtype = "VList";} 286 my $classifyinfo = $self->get_entry ($self->{'title'}, $childtype, "Invisible"); 287 288 # don't need to do any splitting if there are less than 39 (max + min-1) classifications 289 if ((scalar @$classlistref) <= 39) { 454 $childtype = "VList" if (scalar (@$classlistref) <= $self->{'minnesting'}); 455 456 my $classifyinfo; 457 if (!defined($self->{'recopt'})) 458 { 459 my $title = $metaname; 460 $classifyinfo 461 = $self->get_entry ($metaname, $childtype, $metaname, "Invisible"); 462 } 463 else 464 { 465 my $title = $self->{'title'}; 466 $classifyinfo 467 = $self->get_entry ($title, $childtype, $metaname, "VList"); 468 } 469 470 # don't need to do any splitting if there are less than 'minnesting' classifications 471 if ((scalar @$classlistref) <= $self->{'minnesting'}) { 290 472 foreach $subOID (@$classlistref) { 291 473 if ($subOID =~ /^CLASSIFY\.(.*)$/ 292 474 && defined $self->{'classifiers'}->{$1}) 293 475 { 476 ### print STDERR "*** subOID = $subOID\n"; 477 294 478 push (@{$classifyinfo->{'contains'}}, 295 479 $self->{'classifiers'}->{$1}->{'classifyinfo'}); … … 297 481 else 298 482 { 299 push (@{$classifyinfo->{'contains'}}, {'OID'=>$subOID}); 483 $subOID =~ s/^Metavalue_(\d+)\.//; 484 my $metaname_offset = $1 -1; 485 my $oid_rec = {'OID'=>$subOID, 'offset'=>$metaname_offset}; 486 push (@{$classifyinfo->{'contains'}}, $oid_rec); 300 487 } 301 488 } … … 307 494 my $title = $self->{'reclassifylist'}->{$classification}; 308 495 $title =~ s/&(.){2,4};//g; # remove any HTML special chars 309 $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation 496 ### $title =~ s/^\s+//g; # remove a leading spaces 497 ### $title =~ s/^_+//g; # remove a leading underscores 498 $title =~ s/^\W+//g; 499 ### $title =~ s/^(\'|\`|\"|\:|\()//g; # remove any opening punctutation 310 500 $title =~ s/^(.).*$/$1/; 311 501 $title =~ tr/[a-z]/[A-Z]/; … … 334 524 foreach $subclass (@tmparr) 335 525 { 336 my $tempclassify = $self->get_entry ($subclass, "VList");526 my $tempclassify = $self->get_entry($subclass, "VList", $metaname); 337 527 foreach $subsubOID (@{$classhash->{$subclass}}) 338 528 { … … 345 535 else 346 536 { 347 push (@{$tempclassify->{'contains'}}, {'OID'=>$subsubOID}); 537 $subsubOID =~ s/^Metavalue_(\d+)\.//; 538 my $metaname_offset = $1 -1; 539 my $oid_rec = {'OID'=>$subsubOID, 'offset'=>$metaname_offset}; 540 push (@{$tempclassify->{'contains'}}, $oid_rec); 348 541 } 349 542 } … … 354 547 } 355 548 549 sub compactlist { 550 my $self = shift (@_); 551 my ($classhashref) = @_; 552 my $compactedhash = {}; 553 my @currentOIDs = (); 554 my $currentfirstletter = ""; 555 my $currentlastletter = ""; 556 my $lastkey = ""; 557 558 # minimum and maximum documents to be displayed per page. 559 # the actual maximum will be max + (min-1). 560 # the smallest sub-section is a single letter at present 561 # so in this case there may be many times max documents 562 # displayed on a page. 563 my $min = $self->{'mincompact'}; 564 my $max = $self->{'maxcompact'}; 565 566 foreach $subsection (sort keys %$classhashref) { 567 if ($subsection eq '0-9') { 568 @{$compactedhash->{$subsection}} = @{$classhashref->{$subsection}}; 569 next; 570 } 571 $currentfirstletter = $subsection if $currentfirstletter eq ""; 572 if ((scalar (@currentOIDs) < $min) || 573 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) { 574 push (@currentOIDs, @{$classhashref->{$subsection}}); 575 $currentlastletter = $subsection; 576 } else { 577 578 if ($currentfirstletter eq $currentlastletter) { 579 @{$compactedhash->{$currentfirstletter}} = @currentOIDs; 580 $lastkey = $currentfirstletter; 581 } else { 582 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs; 583 $lastkey = "$currentfirstletter-$currentlastletter"; 584 } 585 if (scalar (@{$classhashref->{$subsection}}) >= $max) { 586 $compactedhash->{$subsection} = $classhashref->{$subsection}; 587 @currentOIDs = (); 588 $currentfirstletter = ""; 589 } else { 590 @currentOIDs = @{$classhashref->{$subsection}}; 591 $currentfirstletter = $subsection; 592 $currentlastletter = $subsection; 593 } 594 } 595 } 596 597 # add final OIDs to last sub-classification if there aren't many otherwise 598 # add final sub-classification 599 if (scalar (@currentOIDs) < $min) { 600 my ($newkey) = $lastkey =~ /^(.)/; 601 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs); 602 delete $compactedhash->{$lastkey}; 603 @{$compactedhash->{"$newkey-$currentlastletter"}} = @currentOIDs; 604 } else { 605 if ($currentfirstletter eq $currentlastletter) { 606 @{$compactedhash->{$currentfirstletter}} = @currentOIDs; 607 } else { 608 @{$compactedhash->{"$currentfirstletter-$currentlastletter"}} = @currentOIDs; 609 } 610 } 611 612 return $compactedhash; 613 } 614 356 615 1; 616 617
Note:
See TracChangeset
for help on using the changeset viewer.