Changeset 36911
- Timestamp:
- 2022-11-22T11:48:29+13:00 (13 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/classify/List.pm
r33899 r36911 177 177 { 'name' => "partition_name_length", 178 178 'desc' => "{List.partition_name_length}", 179 'type' => "string" }, 180 179 'type' => "string" }, 180 181 { 'name' => "max_partition_name_length", 182 'desc' => "{List.max_partition_name_length}", 183 'type' => "string", 184 'deft' => "3" }, 185 181 186 {'name' => "partition_sort_mode_within_level", 182 187 'desc' => "{List.partition_sort_mode_within_level}", … … 224 229 { 'name' => "reverse_sort_leaf_nodes", 225 230 'desc' => "{List.reverse_sort_leaf_nodes}", 226 'type' => "flag"}, 227 231 'type' => "flag"}, 232 233 { 'name' => "sort_leaf_nodes_removeprefix", 234 'desc' => "{List.sort_leaf_removeprefix}", 235 'type' => "regexp" }, 236 237 { 'name' => "sort_leaf_nodes_removesuffix", 238 'desc' => "{List.sort_leaf_nodes_removesuffix}", 239 'type' => "regexp" }, 240 228 241 { 'name' => "sort_using_unicode_collation", 229 242 'desc' => "{List.metadata_sort.unicode} {List.sort_using_unicode_collation}", … … 347 360 foreach my $metadata_group (@metadata_groups) { 348 361 my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist); 362 349 363 if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") { 350 364 # Remove the other ' at the beginning and the end if there is any … … 364 378 foreach my $metadata_group (@metadata_groups) { 365 379 my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist); 380 366 381 if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") { 367 382 $removesuffix_expr_within_levelelem =~ s/^'//; … … 479 494 return unless defined $meta; 480 495 if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) { 481 print STDERR "doc $doc_obj doesn't pass filtering requirement \n" if ($self->{'verbosity'} > 3);496 print STDERR "doc $doc_obj doesn't pass filtering requirement. Not classifying.\n" if ($self->{'verbosity'} > 3); 482 497 return; 483 498 } … … 530 545 $classify_section = 1; 531 546 last; 547 532 548 } 533 549 } … … 680 696 } 681 697 elsif ($partition_type_within_level =~ /^approximate_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 682 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'} );698 $self->split_approximate_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'}); 683 699 684 700 } 685 701 elsif ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) { 686 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'} );702 $self->split_constant_size(\@metadata_groups, $classifier_node, $partition_size_within_level, $partition_sort_mode_within_level, $bookshelf_type_within_level, \%metadata_value_to_OIDs_hash, $self->{'partition_name_length'}, $self->{'max_partition_name_length'}); 687 703 } 688 704 … … 732 748 # Generate hlists of a certain size 733 749 734 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);750 $self->split_constant_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level,$numeric_partition_name_length_within_level, 1); 735 751 } elsif ($numeric_partition_type_within_level eq "approximate_size" && scalar(keys %$numeric_metadata_value_to_OIDs_hash_ref) > $numeric_partition_size_within_level) { 736 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, 1);752 $self->split_approximate_size(\@metadata_groups, $classifier_node, $numeric_partition_size_within_level, $numeric_partition_sort_mode_within_level, $bookshelf_type_within_level, $numeric_metadata_value_to_OIDs_hash_ref, $numeric_partition_name_length_within_level, $numeric_partition_name_length_within_level, 1); 737 753 } 738 754 # Otherwise just add all the values to a VList … … 748 764 } 749 765 766 750 767 sub split_approximate_size 768 { 769 770 my $self = shift(@_); 771 my @metadata_groups = @{shift(@_)}; 772 my $classifier_node = shift(@_); 773 my $partition_size = shift(@_); 774 my $sort_mode = shift(@_); 775 my $bookshelf_type = shift(@_); 776 my $metadata_value_to_OIDs_hash_ref = shift(@_); 777 my $partition_name_length = shift(@_); 778 my $max_partition_name_length = shift(@_); 779 my $is_numeric = shift(@_); 780 781 my $tolerance = 10; # should this be an option?? 782 783 # Generate hlist based on the first letter of the metadata value (like per_letter), or based on 784 # numbers, but also with restriction on the partition size 785 # If a partition has fewer items than specified by the "partition_size_within_level", then group them together if possible 786 # If a partition has more items than specified, split into several hlists. 787 # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise) 788 789 my @sortedmetadata_values = $self->sort_metadata_values_array($sort_mode, keys(%$metadata_value_to_OIDs_hash_ref)); 790 791 # Separate values by their first letter, each form a bucket, like the per_letter partition type 792 my $last_partition = $self->generate_partition_name($sortedmetadata_values[0], $partition_name_length, $is_numeric); 793 794 my @partition_buckets = (); 795 my @metadata_values_in_bucket = (); 796 my $num_items_in_bucket = 0; 797 798 foreach my $metadata_value (@sortedmetadata_values) { 799 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 800 if ($metadata_valuepartition ne $last_partition) { 801 my @temp_array = @metadata_values_in_bucket; 802 # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values 803 my %partition_info = (); 804 $partition_info{'metadata_values'} = \@temp_array; 805 $partition_info{'size'} = $num_items_in_bucket; 806 $partition_info{'name'} = $last_partition; 807 push (@partition_buckets, \%partition_info); 808 809 @metadata_values_in_bucket = ($metadata_value); 810 $num_items_in_bucket = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : scalar(@metadata_values_in_bucket)); 811 $last_partition = $metadata_valuepartition; 812 } else { 813 $num_items_in_bucket += ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); #scalar(@metadata_values_in_bucket); 814 push (@metadata_values_in_bucket, $metadata_value); 815 } 816 } 817 # Last one 818 my %partition_info = (); 819 $partition_info{'metadata_values'} = \@metadata_values_in_bucket; 820 $partition_info{'size'} = $num_items_in_bucket; 821 $partition_info{'name'} = $last_partition; 822 823 push (@partition_buckets, \%partition_info); 824 825 # now go through the array of A, B, C buckets, and merge small buckets 826 my @new_partition_buckets = (); 827 for (my $i = 0; $i < scalar(@partition_buckets) - 1; $i++) { 828 829 my $this_bucket = $partition_buckets[$i]; 830 my $next_bucket = $partition_buckets[$i+1]; 831 832 my $items_in_partition = $this_bucket->{'size'}; 833 834 if ($items_in_partition < $partition_size ) { 835 my $items_in_next_partition = $next_bucket->{'size'}; 836 if ($items_in_partition + $items_in_next_partition <= $partition_size+$tolerance ) { 837 # merge this bucket into the next bucket 838 foreach my $metadata_value_to_merge (@{$this_bucket->{'metadata_values'}}) { 839 push(@{$next_bucket->{'metadata_values'}}, $metadata_value_to_merge); 840 } 841 $next_bucket->{'size'} += $items_in_partition; 842 843 } else { 844 # remember this bucket 845 push (@new_partition_buckets, $this_bucket); 846 } 847 } else { 848 # remember this bucket 849 push (@new_partition_buckets, $this_bucket); 850 } 851 } 852 # add in the last bucket 853 my $last_bucket = $partition_buckets[scalar(@partition_buckets) - 1]; 854 push (@new_partition_buckets, $last_bucket); 855 856 # Add partitions to the main list, but divide big bucket into several 857 my $last_partition_end = ""; 858 my $partition_start = ""; 859 my $partition_end = ""; 860 my $partition_name = ""; 861 foreach my $partition (@new_partition_buckets) { 862 my @metadata_values = $self->sort_metadata_values_array($sort_mode, @{$partition->{'metadata_values'}}); 863 my $items_in_partition = $partition->{'size'}; 864 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric); 865 866 if ($items_in_partition <= $partition_size+$tolerance) { 867 # we can just add the partition as is 868 my %metadata_values_to_OIDs_subhashes = (); 869 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 870 my $metadata_value = $metadata_values[$i]; 871 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 872 } 873 my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1]; 874 $partition_end = $self->generate_partition_end($last_metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric); 875 $partition_name = $partition_start; 876 if ($partition_end ne $partition_start) { 877 $partition_name = $partition_name . "-" . $partition_end; 878 } 879 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 880 $last_partition_end = $partition_end; 881 } else { 882 # we have too many items, need to split the partition 883 my $num_splits = int($items_in_partition/$partition_size); 884 $num_splits++ if (($items_in_partition - $partition_size*$num_splits) > $tolerance); 885 my $this_partition_size = $items_in_partition/$num_splits; 886 # regenerate-the start, noticing that we are a split 887 $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1); 888 my $items_done = 0; 889 my $prev_done = 0; 890 my %metadata_values_to_OIDs_subhashes = (); 891 for (my $i = 0; $i < scalar(@metadata_values); $i++) { 892 my $metadata_value = $metadata_values[$i]; 893 # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values 894 my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1; 895 896 if ($items_done + $items_for_this_md_value > $this_partition_size && $items_done != 0 && ($items_in_partition - ($prev_done+$items_done) >= $tolerance)) { 897 # Save the stored items into a partition 898 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 899 $last_partition_end = $partition_end; 900 $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric, 1); 901 $prev_done += $items_done; 902 $items_done = 0; 903 %metadata_values_to_OIDs_subhashes = (); 904 } 905 906 # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions 907 if ($bookshelf_type eq "never" && $items_for_this_md_value > $this_partition_size) { 908 # if this is the case, we would have already stored what we have done in the previous step. 909 my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $partition_name_length, $max_partition_name_length, $is_numeric); 910 # Get the number of partitions needed for this value 911 my $num_splits = int($items_for_this_md_value / $this_partition_size); 912 $num_splits++ if ($items_for_this_md_value / $this_partition_size > $num_splits); 913 my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}; 914 for (my $i = 0; $i < $num_splits; $i++) { 915 my %OIDs_subhashes_for_this_value = (); 916 my @OIDs_for_this_partition = (); 917 for (my $d = $i * $this_partition_size; $d < (($i+1) * $this_partition_size > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size); $d++) { 918 push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]); 919 } 920 921 # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values 922 if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $this_partition_size) { 923 $partition_start = $partitionname_for_this_value; 924 $partition_name = $partition_start; 925 $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition; 926 $items_done += scalar(@OIDs_for_this_partition); 927 $last_partition_end = $partitionname_for_this_value 928 } else { 929 930 # Add an HList for this bucket 931 $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition; 932 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value); 933 $last_partition_end = $partitionname_for_this_value; 934 } 935 } 936 } else { 937 $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash_ref->{$metadata_value}; 938 $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1; 939 $partition_end = $self->generate_partition_end($metadata_value, $partition_start, $partition_name_length, $max_partition_name_length, $is_numeric); 940 $partition_name = $partition_start; 941 if ($partition_end ne $partition_start) { 942 $partition_name = $partition_name . "-" . $partition_end; 943 } 944 945 } 946 947 # The last partition 948 if($i == scalar(@metadata_values) - 1 && $items_done >0) { 949 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partition_name, \%metadata_values_to_OIDs_subhashes); 950 } 951 952 } 953 } # end if items in partition > partition size 954 955 } 956 957 # The partitions are stored in an HList 958 $classifier_node->{'childtype'} = "HList"; 959 960 } 961 sub split_approximate_size_ORIG 751 962 { 752 963 … … 934 1145 } 935 1146 1147 936 1148 sub split_constant_size 937 1149 { … … 944 1156 my $metadata_value_to_OIDs_hash_ref = shift(@_); 945 1157 my $partition_name_length = shift(@_); 1158 my $max_partition_name_length = shift(@_); 946 1159 my $is_numeric = shift(@_); 947 1160 … … 955 1168 if ($items_in_partition == 0) { 956 1169 # a new partition, set the name 957 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $ is_numeric);1170 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric); 958 1171 } 959 1172 my $numitems_for_this_value = ($bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash_ref->{$metadata_value}}) : 1); … … 972 1185 973 1186 # now we have filled up the partition 974 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $ is_numeric);1187 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $max_partition_name_length, $is_numeric); 975 1188 my $partitionname = $partitionstart; 976 1189 if ($partitionend ne $partitionstart) { … … 987 1200 my @slice = splice(@OIDs, 0, $partition_size); 988 1201 $metadata_value_to_OIDs_subhash{$metadata_value} = \@slice; 989 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $ is_numeric);990 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $ is_numeric);1202 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric); 1203 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $max_partition_name_length, $is_numeric); 991 1204 my $partitionname = $partitionstart; 992 1205 if ($partitionend ne $partitionstart) { … … 1002 1215 $metadata_value_to_OIDs_subhash{$metadata_value} = \@OIDs; 1003 1216 $items_in_partition = scalar(@OIDs); 1004 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $ is_numeric);1217 $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $partition_name_length, $max_partition_name_length, $is_numeric); 1005 1218 } 1006 1219 … … 1010 1223 if ($items_in_partition == $partition_size) { 1011 1224 # its the end of a partition 1012 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $ is_numeric);1225 my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $partition_name_length, $max_partition_name_length, $is_numeric); 1013 1226 my $partitionname = $partitionstart; 1014 1227 if ($partitionend ne $partitionstart) { … … 1025 1238 if ($items_in_partition > 0) { 1026 1239 # we have to add the last partition 1027 my $partitionend = $self->generate_partition_end(@sortedmetadata_values[@sortedmetadata_values-1], $partitionstart, $partition_name_length, $ is_numeric);1240 my $partitionend = $self->generate_partition_end(@sortedmetadata_values[@sortedmetadata_values-1], $partitionstart, $partition_name_length, $max_partition_name_length, $is_numeric); 1028 1241 my $partitionname = $partitionstart; 1029 1242 if ($partitionend ne $partitionstart) { … … 1061 1274 my $lastpartition = $self->generate_partition_name($sortedmetadata_values[0], $numeric_partition_length, $is_numeric); 1062 1275 foreach my $metadata_value (@sortedmetadata_values) { 1063 1064 1276 my $metadata_valuepartition = $self->generate_partition_name($metadata_value, $numeric_partition_length, $is_numeric); 1065 1066 1277 # Is this the start of a new partition? 1067 1278 if ($metadata_valuepartition ne $lastpartition) { 1279 #print STDERR "new partition, old = $lastpartition, new=$metadata_valuepartition\n"; 1068 1280 $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash); 1069 1281 %metadata_value_to_OIDs_subhash = (); … … 1106 1318 my $lastpartitionend = shift(@_); 1107 1319 my $partition_name_length = shift(@_); 1320 my $max_partition_name_length = shift(@_); 1108 1321 my $is_numeric = shift(@_); 1109 1322 my $is_split = shift(@_); 1323 1324 # print STDERR "generate-partitoin start, $metadata_value, $lastpartitionend, $max_partition_name_length\n"; 1110 1325 if ($is_numeric) { 1111 1326 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); 1112 1327 } 1113 1328 1329 $is_split = 0 unless defined $is_split; 1330 # print STDERR "is split = $is_split\n"; 1114 1331 if ($partition_name_length) { 1332 # print STDERR "partiton anme lenght = $partition_name_length, just using that length\n"; 1115 1333 return substr($metadata_value, 0, $partition_name_length); 1116 1334 } 1117 1335 1118 my $partitionstart = substr($metadata_value, 0, 1); 1119 if ($partitionstart le $lastpartitionend) { 1120 $partitionstart = substr($metadata_value, 0, 2); 1121 # Give up after three characters 1122 if ($partitionstart le $lastpartitionend) { 1123 $partitionstart = substr($metadata_value, 0, 3); 1124 } 1336 my $variable_partition_name_length = (1+$is_split <= $max_partition_name_length ? 1+$is_split : $max_partition_name_length); 1337 my $partitionstart = substr($metadata_value, 0, $variable_partition_name_length); 1338 # print STDERR "new start = $partitionstart, $variable_partition_name_length\n"; 1339 while(($partitionstart le $lastpartitionend) && (($variable_partition_name_length += 1) <= $max_partition_name_length)) { 1340 # print STDERR "NEW###### $variable_partition_name_length\n"; 1341 $partitionstart = substr($metadata_value, 0, $variable_partition_name_length); 1125 1342 } 1126 1343 … … 1135 1352 my $partitionstart = shift(@_); 1136 1353 my $partition_name_length = shift(@_); 1354 my $max_partition_name_length = shift(@_); 1137 1355 my $is_numeric = shift(@_); 1138 1356 #print STDERR "geenrate end, $metadata_value: $partitionstart: $partition_name_length: $max_partition_name_length\n"; 1139 1357 if ($is_numeric) { 1140 1358 return $self->generate_partition_name($metadata_value, $partition_name_length, $is_numeric); … … 1145 1363 1146 1364 my $partitionend = substr($metadata_value, 0, length($partitionstart)); 1147 if ($partitionend gt $partitionstart) {1148 $partitionend = substr($metadata_value, 0, 1);1149 if ($partitionend le $partitionstart) {1150 $partitionend = substr($metadata_value, 0, 2);1151 # Give up after three characters1152 if ($partitionend le $partitionstart) {1153 $partitionend = substr($metadata_value, 0, 3);1154 }1155 }1156 }1365 #if ($partitionend gt $partitionstart) { 1366 # $partitionend = substr($metadata_value, 0, 1); 1367 # if ($partitionend le $partitionstart) { 1368 # $partitionend = substr($metadata_value, 0, 2); 1369 # # Give up after three characters 1370 # if ($partitionend le $partitionstart) { 1371 # $partitionend = substr($metadata_value, 0, 3); 1372 # } 1373 # } 1374 # } 1157 1375 1158 1376 return $partitionend;
Note:
See TracChangeset
for help on using the changeset viewer.