Changeset 18619 for gsdl


Ignore:
Timestamp:
03/04/09 16:49:04 (12 years ago)
Author:
anna
Message:

Added several options: 1) more bookshelf types (a.never create bookshelves, b.only for duplicated items and c.always create bookshelf); 2) remove_prefix and remove_suffix; 3) per_letter_fixed_size partition type, which differs from the constant_size type in two ways: a.prohibit combining numeric metadata values with non-numeric ones; b.partition size is calculated differently depending on the bookshelf_type, ie. if bookshelf_type is 'never', count documents, otherwise count distinct metadata values; c.if a single letter contains many items and has been split into several partitions, the tail (ie. the last partition) won't be combined with next letter. More testing still needed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/classify/List.pm

    r18572 r18619  
    4848    'desc' => "{List.level_partition.per_letter}" },
    4949      { 'name' => "constant_size",
    50     'desc' => "{List.level_partition.constant_size}" },
     50    'desc' => "{List.level_partition.constant_size}" },     
     51      { 'name' => "per_letter_fixed_size",
     52    'desc' => "{List.level_partition.per_letter_fixed_size}"},
    5153      { 'name' => "none",
    5254    'desc' => "{List.level_partition.none}" } ];
    53      
     55
     56my $bookshelf_type_list =
     57    [ { 'name' => "always",
     58    'desc' => "{List.bookshelf_type.always}" },
     59      { 'name' => "duplicate_only",
     60    'desc' => "{List.bookshelf_type.duplicate_only}" },
     61      { 'name' => "never",
     62    'desc' => "{List.bookshelf_type.never}" } ];     
    5463
    5564my $arguments =
     
    6069
    6170      # The interesting options
    62       { 'name' => "always_bookshelf_last_level",
    63     'desc' => "{GenericList.always_bookshelf_last_level}",
    64     'type' => "flag" },
     71      { 'name' => "bookshelf_type",
     72    'desc' => "{List.bookshelf_types}",
     73    'type' => "enum",
     74    'list' => $bookshelf_type_list,
     75    'deft' => "duplicate_only" },
    6576      { 'name' => "classify_sections",
    6677    'desc' => "{GenericList.classify_sections}",
     
    6879      { 'name' => "partition_type_within_level",
    6980    'desc' => "{GenericList.partition_type_within_level}",
    70     'type' => "enum",
    71     'list' => $partition_type_list,
     81    'type' => "string",
    7282    'deft' => "none" },
    7383      { 'name' => "partition_size_within_level",
     
    8898      { 'name' => "use_hlist_for",
    8999    'desc' => "{GenericList.use_hlist_for}",
    90     'type' => "string" } ];
     100    'type' => "string" },
     101      { 'name' => "removeprefix",
     102    'desc' => "{BasClas.removeprefix}",
     103    'type' => "regexp" },
     104      { 'name' => "removesuffix",
     105    'desc' => "{BasClas.removesuffix}",
     106    'type' => "regexp" } ];
    91107
    92108my $options = { 'name'     => "List",
     
    127143    }
    128144
    129     # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
     145    # Whether to group items into a bookshelf, (must be 'always' for all metadata fields except the last)
    130146    foreach my $metadata_group (@metadata_groups) {
    131     $self->{$metadata_group . ".always_bookshelf"} = "t";
    132     }
    133     if (!$self->{'always_bookshelf_last_level'}) {
    134     # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
    135     my $last_metadata_group = $metadata_groups[$#metadata_groups];
    136     $self->{$last_metadata_group . ".always_bookshelf"} = "f";
    137     }
    138 
     147    $self->{$metadata_group . ".bookshelf_type"} = "always";
     148    }   
     149    my $last_metadata_group = $metadata_groups[$#metadata_groups];
     150    # Default: duplicate_only, ie. leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
     151    $self->{$last_metadata_group . ".bookshelf_type"} = $self->{'bookshelf_type'};
     152       
    139153    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
    140154    foreach my $metadata_group (@metadata_groups) {
     
    145159    }
    146160
    147     # How the items are grouped into partitions (default: no partition)
     161    # How the items are grouped into partitions (default: no partition)
     162    # for each level (metadata group), separated by '/'
    148163    if (!$self->{"partition_type_within_level"}) {
    149     $self->{"partition_type_within_level"} = "none";
     164    foreach my $metadata_group (@metadata_groups) {
     165        $self->{$metadata_group . ".partition_type_within_level"} = "none";
     166    }
     167    } else {
     168    my @partition_type_within_levellist = split(/\//, $self->{'partition_type_within_level'}); 
     169    foreach my $metadata_group (@metadata_groups) {
     170        my $partition_type_within_levelelem = shift(@partition_type_within_levellist);
     171        if (defined($partition_type_within_levelelem)) {
     172        $self->{$metadata_group . ".partition_type_within_level"} = $partition_type_within_levelelem;
     173        }
     174        else {
     175        $self->{$metadata_group . ".partition_type_within_level"} = $self->{$metadata_groups[0] . ".partition_type_within_level"};
     176        }
     177    }
    150178    }
    151179
     
    168196        else {
    169197        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
     198        }
     199    }
     200    }
     201
     202    # The removeprefix and removesuffix expressions
     203    if ($self->{'removeprefix'}) {
     204    # If there are more than one expressions, use '' to quote each experession and '/' to separate
     205    my @removeprefix_exprs_within_levellist = split(/'\/'/, $self->{'removeprefix'});
     206
     207    foreach my $metadata_group (@metadata_groups) {
     208        my $removeprefix_expr_within_levelelem = shift(@removeprefix_exprs_within_levellist);
     209        if (defined($removeprefix_expr_within_levelelem) && $removeprefix_expr_within_levelelem ne "") {
     210        # Remove the other ' at the beginning and the end if there is any
     211        $removeprefix_expr_within_levelelem =~ s/^'//;
     212        $removeprefix_expr_within_levelelem =~ s/'$//;
     213        # Remove the extra ^ at the beginning
     214        $removeprefix_expr_within_levelelem =~ s/^\^//;
     215        $self->{$metadata_group . ".remove_prefix_expr"} = $removeprefix_expr_within_levelelem;
     216        } else {
     217        $self->{$metadata_group . ".remove_prefix_expr"} = $self->{$metadata_groups[0] . ".remove_prefix_expr"};
     218        }
     219    }
     220    }
     221    if ($self->{'removesuffix'}) { 
     222    my @removesuffix_exprs_within_levellist = split(/'\/'/, $self->{'removesuffix'});
     223
     224    foreach my $metadata_group (@metadata_groups) {
     225        my $removesuffix_expr_within_levelelem = shift(@removesuffix_exprs_within_levellist);
     226        if (defined($removesuffix_expr_within_levelelem) && $removesuffix_expr_within_levelelem ne "") {
     227        $removesuffix_expr_within_levelelem =~ s/^'//;
     228        $removesuffix_expr_within_levelelem =~ s/'$//;
     229        # Remove the extra $ at the end
     230        $removesuffix_expr_within_levelelem =~ s/\$$//;
     231        $self->{$metadata_group . ".remove_suffix_expr"} = $removesuffix_expr_within_levelelem;
     232        } else {
     233        $self->{$metadata_group . ".remove_suffix_expr"} = $self->{$metadata_groups[0] . ".remove_suffix_expr"};
    170234        }
    171235    }
     
    233297    my $first_metadata_group = $metadata_groups[0];
    234298    foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
    235     my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
     299    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);   
     300   
     301    # Remove prefix/suffix if requested
     302    my $remove_prefix_expr = $self->{$first_metadata_group_element . ".remove_prefix_expr"};
     303    if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {     
     304        $first_metadata_group_element_value =~ s/^$remove_prefix_expr//;
     305    }
     306    my $remove_suffix_expr = $self->{$first_metadata_group_element . ".remove_suffix_expr"};
     307    if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
     308        $first_metadata_group_element_value =~ s/$remove_suffix_expr$//;
     309    }
     310
    236311    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
    237312        # This section must be included in the classifier
    238313        $classify_section = 1;
    239314        last;
    240     }
     315    } 
    241316    }
    242317
    243318    # We're not classifying this section because it doesn't have the required metadata
    244319    return if (!$classify_section);
    245 
    246320
    247321    if (($edit_mode eq "delete") || ($edit_mode eq "update")) {
     
    260334    # Take care not to do a metadata group more than once
    261335    unless ($metadata_groups_done{$metadata_group}) {
    262         foreach my $metadata_element (split(/\;/, $metadata_group)) {
     336        foreach my $metadata_element (split(/\;/, $metadata_group)) {       
     337        my $remove_prefix_expr = $self->{$metadata_element . ".remove_prefix_expr"};
     338        my $remove_suffix_expr = $self->{$metadata_element . ".remove_suffix_expr"};
    263339        my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
    264340        foreach my $metadata_value (@metadata_values) {
     
    266342            $metadata_value =~ s/^\s*//;
    267343            $metadata_value =~ s/\s*$//;
     344
     345            # Remove prefix/suffix if requested
     346            if (defined $remove_prefix_expr && $remove_prefix_expr ne "") {
     347            $metadata_value =~ s/^$remove_prefix_expr//;
     348            }
     349            if (defined $remove_suffix_expr && $remove_suffix_expr ne "") {
     350            $metadata_value =~ s/$remove_suffix_expr$//;
     351            }
    268352
    269353            # Convert the metadata value from a UTF-8 string to a Unicode string
     
    327411    # Create a mapping from metadata value to OID
    328412    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
    329     my %metadata_value_to_OIDs_hash = ();
     413    my %metadata_value_to_OIDs_hash = ();   
    330414    foreach my $OID (@OIDs)
    331415    {
     
    342426
    343427    # Partition the values (if necessary)
    344     my $partition_type_within_level = $self->{"partition_type_within_level"};
     428    my $partition_type_within_level = $self->{$metadata_group . ".partition_type_within_level"};
    345429    if ($partition_type_within_level =~ /^per_letter$/i) {
    346430    # Generate one hlist for each letter
     
    368452    $classifier_node->{'childtype'} = "HList";
    369453    }
    370 
     454    elsif ($partition_type_within_level =~ /^per_letter_fixed_size$/i) {
     455    # Generate hlist based on the first letter of the metadata value (like per_letter) but with restriction on the partition size
     456    # If a letter has fewer items than specified by the "partition_size_within_level", then group them together if possible
     457    # If a letter has more items than specified, split into several hlists.
     458    # Depends on the bookshelf_type, one item can be either a document (when bookshelf_type is "never") or a metadata value (otherwise)
     459    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};       
     460    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
     461    my $bookshelf_type = $self->{$metadata_group . ".bookshelf_type"};
     462   
     463    # Separate values by their first letter, each form a bucket, like the per_letter partition type
     464    my $last_partition = substr($sortedmetadata_values[0], 0, 1);
     465    my %partition_buckets = ();
     466    my @metadata_values_in_bucket = ();
     467    my $num_items_in_bucket = 0;
     468    foreach my $metadata_value (@sortedmetadata_values) {       
     469        my $metadata_valuepartition = substr($metadata_value, 0, 1);
     470        if ($metadata_valuepartition ne $last_partition) {
     471        my @temp_array = @metadata_values_in_bucket;
     472        # Cache the values that belong to this bucket, and the number of items in this bucket, not necessary to be the same number as the metadata values
     473        my %partition_info = ();       
     474        $partition_info{'metadata_values'} = \@temp_array;
     475        $partition_info{'size'} = $num_items_in_bucket;     
     476        $partition_buckets{$last_partition} = \%partition_info;     
     477       
     478        @metadata_values_in_bucket = ($metadata_value);
     479        $num_items_in_bucket = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
     480        $last_partition = $metadata_valuepartition;
     481        } else {
     482        $num_items_in_bucket += $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : scalar(@metadata_values_in_bucket);
     483        push (@metadata_values_in_bucket, $metadata_value);
     484        }
     485    }
     486    # Last one
     487    my %partition_info = ();
     488    $partition_info{'metadata_values'} = \@metadata_values_in_bucket;
     489    $partition_info{'size'} = $num_items_in_bucket;
     490    $partition_buckets{$last_partition} = \%partition_info;
     491       
     492    my @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));   
     493    for (my $i = 0; $i < scalar(@partition_keys) - 1; $i++) {
     494        my $partition = $partition_keys[$i];
     495        my $items_in_partition = $partition_buckets{$partition}->{'size'};             
     496        # Merge small buckets together, but keep the numeric bucket apart
     497        if ($items_in_partition < $partition_size_within_level) {
     498        my $items_in_next_partition = $partition_buckets{$partition_keys[$i+1]}->{'size'};
     499        if ($items_in_partition + $items_in_next_partition <= $partition_size_within_level
     500            && !(($partition =~ /^[^0-9]/ && $partition_keys[$i+1] =~ /^[0-9]/)
     501             || ($partition =~ /^[0-9]/ && $partition_keys[$i+1] =~ /^[^0-9]/))) {
     502            foreach my $metadata_value_to_merge (@{$partition_buckets{$partition}->{'metadata_values'}}) {
     503            push(@{$partition_buckets{$partition_keys[$i+1]}->{'metadata_values'}}, $metadata_value_to_merge);
     504            }           
     505            $partition_buckets{$partition_keys[$i+1]}->{'size'} += $items_in_partition;
     506            delete $partition_buckets{$partition};                     
     507        }
     508        }
     509    }
     510    @partition_keys = $self->sort_metadata_values_array(keys(%partition_buckets));
     511   
     512    # Add partitions, and divide big bucket into several
     513    my $last_partition_end = "";
     514    my $partition_start = "";
     515    foreach my $partition (@partition_keys) {
     516        my @metadata_values = $self->sort_metadata_values_array(@{$partition_buckets{$partition}->{'metadata_values'}});
     517        my $items_in_partition = $partition_buckets{$partition}->{'size'};
     518        $partition_start = $self->generate_partition_start($metadata_values[0], $last_partition_end, $self->{"partition_name_length"});
     519       
     520        if ($items_in_partition > $partition_size_within_level) {   
     521        my $items_done = 0;
     522        my %metadata_values_to_OIDs_subhashes = ();
     523        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
     524            my $metadata_value = $metadata_values[$i];
     525            # If the bookshelf_type is "never", count the documents, otherwise count the distinct metadata values
     526            my $items_for_this_md_value = $bookshelf_type eq "never" ? scalar(@{$metadata_value_to_OIDs_hash{$metadata_value}}) : 1;
     527
     528            my $partitionend = $self->generate_partition_end($metadata_value, $partition_start, $self->{"partition_name_length"});
     529            my $partitionname = $partition_start;
     530            if ($partitionend ne $partition_start) {
     531            $partitionname = $partitionname . "-" . $partitionend;
     532            }
     533           
     534            # Start a new partition
     535            if ($items_done + $items_for_this_md_value > $partition_size_within_level && $items_done != 0) {
     536            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
     537            $last_partition_end = $partitionend;           
     538            $partition_start = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
     539            $items_done = 0;
     540            %metadata_values_to_OIDs_subhashes = ();
     541            }
     542           
     543            # If bookshelf_type is "never" and the current metadata value holds too many items, need to split into several partitions
     544                    if ($bookshelf_type eq "never" && $items_for_this_md_value > $partition_size_within_level) {
     545            my $partitionname_for_this_value = $self->generate_partition_start($metadata_value, $last_partition_end, $self->{"partition_name_length"});
     546            # Get the number of partitions needed for this value
     547            my $num_splits = int($items_for_this_md_value / $partition_size_within_level);
     548            $num_splits++ if ($items_for_this_md_value / $partition_size_within_level > $num_splits);
     549
     550            my @OIDs_for_this_value = @{$metadata_value_to_OIDs_hash{$metadata_value}};
     551            for (my $i = 0; $i < $num_splits; $i++) {
     552                my %OIDs_subhashes_for_this_value = ();
     553                my @OIDs_for_this_partition = ();
     554                for (my $d = $i * $partition_size_within_level; $d < (($i+1) * $partition_size_within_level > $items_for_this_md_value ? $items_for_this_md_value : ($i+1) * $partition_size_within_level); $d++) {
     555                push (@OIDs_for_this_partition, $OIDs_for_this_value[$d]);
     556                }
     557               
     558                # The last bucket might have only a few items and need to be merged with buckets for subsequent metadata values
     559                if ($i == $num_splits - 1 && scalar(@OIDs_for_this_partition) < $partition_size_within_level) {
     560                $metadata_values_to_OIDs_subhashes{$metadata_value} = \@OIDs_for_this_partition;
     561                $items_done += scalar(@OIDs_for_this_partition);
     562                next;
     563                }
     564               
     565                # Add an HList for this bucket
     566                $OIDs_subhashes_for_this_value{$metadata_value} = \@OIDs_for_this_partition;
     567                $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname_for_this_value, \%OIDs_subhashes_for_this_value);
     568                $last_partition_end = $partitionname_for_this_value;
     569            }
     570            next;
     571                    }
     572                       
     573            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
     574            $items_done += $bookshelf_type eq "never" ? scalar(@{$metadata_values_to_OIDs_subhashes{$metadata_value}}) : 1;         
     575           
     576            # The last partition
     577            if($i == scalar(@metadata_values) - 1) {
     578            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);
     579            }
     580        }       
     581        }
     582        else {
     583        # The easier case, just add a partition
     584        my %metadata_values_to_OIDs_subhashes = ();
     585        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
     586            my $metadata_value = $metadata_values[$i];
     587            $metadata_values_to_OIDs_subhashes{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};           
     588        }
     589        my $last_metadata_value = $metadata_values[scalar(@metadata_values)-1];
     590        my $partitionend = $self->generate_partition_end($last_metadata_value, $partition_start, $self->{"partition_name_length"});
     591        my $partitionname = $partition_start;
     592        if ($partitionend ne $partition_start) {
     593            $partitionname = $partitionname . "-" . $partitionend;
     594        }
     595        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_values_to_OIDs_subhashes);           
     596        $last_partition_end = $partitionend;       
     597        }
     598    }   
     599    }
    371600    else {
    372601    # Generate hlists of a certain size
     
    525754    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
    526755
    527     # If there is only one item and 'always_bookshelf' is false, add the item to the list
    528     if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
     756    # If there is only one item and 'bookshelf_type' is not always (ie. never or duplicate_only), add the item to the list
     757    if (@OIDs == 1 && $self->{$metadata_group . ".bookshelf_type"} ne "always") {
    529758        my $OID = $OIDs[0];
    530759
     
    540769        }
    541770        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
    542     }
    543 
     771    }
     772    # If 'bookshelf_type' is 'never', list all the items even if there are duplicated values
     773    elsif ($self->{$metadata_group . ".bookshelf_type"} eq "never") {
     774       $self->add_sorted_leaf_items(\@OIDs, $classifier_node);
     775    }
    544776    # Otherwise create a sublist (bookshelf) for the metadata value
    545     else {
     777    else {         
    546778        my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
    547779                      'childtype' => "VList",
     
    556788        # Otherwise just add the documents as children of this list
    557789        else {
    558         # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
    559         my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
    560         foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
    561             my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
    562             # Force a stable sort (Perl 5.6's sort isn't stable)
    563             # !! The [0] bits aren't ideal (multiple metadata values) !!
    564             @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
    565         }
    566 
    567         foreach my $OID (@OIDs) {
    568             push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
    569         }
     790        $self->add_sorted_leaf_items(\@OIDs, \%child_classifier_node);
    570791        }
    571792
     
    576797}
    577798
     799sub add_sorted_leaf_items
     800{
     801    my $self = shift(@_);
     802    my @OIDs = @{shift(@_)};
     803    my $classifier_node = shift(@_);   
     804   
     805    # Sort leaf nodes and add to list
     806    my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
     807    foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
     808    my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
     809    # Force a stable sort (Perl 5.6's sort isn't stable)
     810    # !! The [0] bits aren't ideal (multiple metadata values) !!
     811    @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
     812    }
     813
     814    foreach my $OID (@OIDs) {
     815    push(@{$classifier_node->{'contains'}}, { 'OID' => $OID });
     816    }
     817}
     818
    578819
    579820sub sort_metadata_values_array
Note: See TracChangeset for help on using the changeset viewer.