root/gsdl/trunk/perllib/classify/GenericList.pm @ 18455

Revision 18455, 19.7 KB (checked in by davidb, 12 years ago)

Addition of 'edit_mode' parameter to classify(). This can be either 'add' 'delete' or 'reindex' (should think about renaming the last one to something more appropriate, e.g. update).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# GenericList.pm -- A general and flexible list classifier with most of
4#                   the abilities of AZCompactList, and better Unicode,
5#                   metadata and sorting capabilities.
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
12#
13# Copyright (C) 2005 New Zealand Digital Library Project
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, write to the Free Software
27# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28#
29# TO DO: - Remove punctuation from metadata values before sorting.
30#        - Add an AZCompactList-style hlist option?
31#
32###########################################################################
33
34package GenericList;
35
36
37use BaseClassifier;
38
39use strict;
40
41
42sub BEGIN {
43    @GenericList::ISA = ('BaseClassifier');
44}
45
46
47my $arguments =
48    [ { 'name' => "metadata",
49    'desc' => "{GenericList.metadata}",
50    'type' => "metadata",
51    'reqd' => "yes" },
52
53      # The interesting options
54      { 'name' => "always_bookshelf_last_level",
55    'desc' => "{GenericList.always_bookshelf_last_level}",
56    'type' => "flag" },
57      { 'name' => "classify_sections",
58    'desc' => "{GenericList.classify_sections}",
59    'type' => "flag" },
60      { 'name' => "partition_type_within_level",
61    'desc' => "{GenericList.partition_type_within_level}",
62    'type' => "string",
63    'deft' => "none" },
64      { 'name' => "partition_size_within_level",
65    'desc' => "{GenericList.partition_size_within_level}",
66    'type' => "string" },
67      { 'name' => "partition_name_length",
68    'desc' => "{GenericList.partition_name_length}",
69    'type' => "string" },
70      { 'name' => "sort_leaf_nodes_using",
71    'desc' => "{GenericList.sort_leaf_nodes_using}",
72    'type' => "metadata",
73    'deft' => "Title" },
74      { 'name' => "sort_using_unicode_collation",
75    'desc' => "{GenericList.sort_using_unicode_collation}",
76    'type' => "flag" },
77      { 'name' => "use_hlist_for",
78    'desc' => "{GenericList.use_hlist_for}",
79    'type' => "string" } ];
80
81my $options = { 'name'     => "GenericList",
82        'desc'     => "{GenericList.desc}",
83        'abstract' => "no",
84        'inherits' => "Yes",
85        'args'     => $arguments };
86
87
88sub new
89{
90    my ($class) = shift(@_);
91    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
92    push(@$classifierslist, $class);
93
94    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
95    push(@{$hashArgOptLists->{"OptList"}}, $options);
96
97    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
98
99    if ($self->{'info_only'}) {
100    # don't worry about any options etc
101    return bless $self, $class;
102    }
103
104    # The metadata elements to use (required)
105    if (!$self->{'metadata'}) {
106    die "Error: No metadata fields specified for GenericList.\n";
107    }
108    my @metadata_groups = split(/\//, $self->{'metadata'});
109    $self->{'metadata_groups'} = \@metadata_groups;
110
111    # The classifier button name (default: the first metadata element specified)
112    if (!$self->{'buttonname'}) {
113    my $first_metadata_group = $metadata_groups[0];
114    my $first_metadata_element = (split(/\;/, $first_metadata_group))[0];
115    $self->{'buttonname'} = $self->generate_title_from_metadata($first_metadata_element);
116    }
117
118    # Whether to group single items into a bookshelf (must be true for all metadata fields except the last)
119    foreach my $metadata_group (@metadata_groups) {
120    $self->{$metadata_group . ".always_bookshelf"} = "t";
121    }
122    if (!$self->{'always_bookshelf_last_level'}) {
123    # Default: leave leaf nodes ungrouped (equivalent to AZCompactList -mingroup 2)
124    my $last_metadata_group = $metadata_groups[$#metadata_groups];
125    $self->{$last_metadata_group . ".always_bookshelf"} = "f";
126    }
127
128    # Whether to use an hlist or a vlist for each level in the hierarchy (default: vlist)
129    foreach my $metadata_group (@metadata_groups) {
130    $self->{$metadata_group . ".list_type"} = "VList";
131    }
132    foreach my $metadata_group (split(/\,/, $self->{'use_hlist_for'})) {
133    $self->{$metadata_group . ".list_type"} = "HList";
134    }
135
136    # How the items are grouped into partitions (default: no partition)
137    if (!$self->{"partition_type_within_level"}) {
138    $self->{"partition_type_within_level"} = "none";
139    }
140
141    # The number of items in each partition
142    if (!$self->{'partition_size_within_level'}) {
143    # Default: 20
144    foreach my $metadata_group (@metadata_groups) {
145        $self->{$metadata_group . ".partition_size_within_level"} = 20;
146    }
147    }
148    else {
149    my @partition_size_within_levellist = split(/\//, $self->{'partition_size_within_level'});
150
151    # Assign values based on the partition_size_within_level parameter
152    foreach my $metadata_group (@metadata_groups) {
153        my $partition_size_within_levelelem = shift(@partition_size_within_levellist);
154        if (defined($partition_size_within_levelelem)) {
155        $self->{$metadata_group . ".partition_size_within_level"} = $partition_size_within_levelelem;
156        }
157        else {
158        $self->{$metadata_group . ".partition_size_within_level"} = $self->{$metadata_groups[0] . ".partition_size_within_level"};
159        }
160    }
161    }
162
163    # The metadata elements to use to sort the leaf nodes (default: Title)
164    my @sort_leaf_nodes_using_metadata_groups = ( "Title" );
165    if ($self->{'sort_leaf_nodes_using'}) {
166    @sort_leaf_nodes_using_metadata_groups = split(/\|/, $self->{'sort_leaf_nodes_using'});
167    }
168    $self->{'sort_leaf_nodes_using_metadata_groups'} = \@sort_leaf_nodes_using_metadata_groups;
169
170    # Create an instance of the Unicode::Collate object if better Unicode sorting is desired
171    if ($self->{'sort_using_unicode_collation'}) {
172    # To use this you first need to download the allkeys.txt file from
173        # http://www.unicode.org/Public/UCA/latest/allkeys.txt and put it in the Perl
174        # Unicode/Collate directory.
175    require Unicode::Collate;
176    $self->{'unicode_collator'} = Unicode::Collate->new();
177    }
178
179    # An empty array for the document OIDs
180    $self->{'OIDs'} = [];
181
182    return bless $self, $class;
183}
184
185
186sub init
187{
188    # Nothing to do...
189}
190
191
192# Called for each document in the collection
193sub classify
194{
195    my $self = shift(@_);
196    my ($doc_obj,$edit_mode) = @_;
197
198    # If "-classify_sections" is set, classify every section of the document
199    if ($self->{'classify_sections'}) {
200    my $section = $doc_obj->get_next_section($doc_obj->get_top_section());
201    while (defined $section) {
202        $self->classify_section($doc_obj, $doc_obj->get_OID() . ".$section", $section, $edit_mode);
203        $section = $doc_obj->get_next_section($section);
204    }
205    }
206    # Otherwise just classify the top document section
207    else {
208    $self->classify_section($doc_obj, $doc_obj->get_OID(), $doc_obj->get_top_section(), $edit_mode);
209    }
210}
211
212
213sub classify_section
214{
215    my $self = shift(@_);
216    my ($doc_obj,$section_OID,$section,$edit_mode) = @_;
217
218    my @metadata_groups = @{$self->{'metadata_groups'}};
219
220    # Only classify the section if it has a value for one of the metadata elements in the first group
221    my $classify_section = 0;
222    my $first_metadata_group = $metadata_groups[0];
223    foreach my $first_metadata_group_element (split(/\;/, $first_metadata_group)) {
224    my $first_metadata_group_element_value = $doc_obj->get_metadata_element($section, $first_metadata_group_element);
225    if (defined($first_metadata_group_element_value) && $first_metadata_group_element_value ne "") {
226        # This section must be included in the classifier
227        $classify_section = 1;
228        last;
229    }
230    }
231
232    # We're not classifying this section because it doesn't have the required metadata
233    return if (!$classify_section);
234
235    if ($edit_mode eq "delete") {
236    $self->oid_array_delete($section_OID,'OIDs');
237    return;
238    }
239   
240    # Otherwise, include this section in the classifier
241    push(@{$self->{'OIDs'}}, $section_OID);
242
243    # Create a hash for the metadata values of each metadata element we're interested in
244    my %metadata_groups_done = ();
245    foreach my $metadata_group (@metadata_groups, @{$self->{'sort_leaf_nodes_using_metadata_groups'}}) {
246    # Take care not to do a metadata group more than once
247    unless ($metadata_groups_done{$metadata_group}) {
248        foreach my $metadata_element (split(/\;/, $metadata_group)) {
249        my @metadata_values = @{$doc_obj->get_metadata($section, $metadata_element)};
250        foreach my $metadata_value (@metadata_values) {
251            # Strip leading and trailing whitespace
252            $metadata_value =~ s/^\s*//;
253            $metadata_value =~ s/\s*$//;
254
255            # Convert the metadata value from a UTF-8 string to a Unicode string
256            # This means that length() and substr() work properly
257            # We need to be careful to convert classifier node title values back to UTF-8, however
258            my $metadata_value_unicode_string = $self->convert_utf8_string_to_unicode_string($metadata_value);
259
260            # Add the metadata value into the list for this combination of metadata group and section
261            push(@{$self->{$metadata_group . ".list"}->{$section_OID}}, $metadata_value_unicode_string);
262        }
263        last if (@metadata_values > 0);
264        }
265
266        $metadata_groups_done{$metadata_group} = 1;
267    }
268    }
269}
270
271
272sub get_classify_info
273{
274    my $self = shift(@_);
275
276    # The metadata groups to classify by
277    my @metadata_groups = @{$self->{'metadata_groups'}};
278    my $first_metadata_group = $metadata_groups[0];
279
280    # The OID values of the documents to include in the classifier
281    my @OIDs = @{$self->{'OIDs'}};
282
283    # Create the root node of the classification hierarchy
284    my %classifier_node = ( 'thistype' => "Invisible",
285                'childtype' => $self->{$first_metadata_group . ".list_type"},
286                'Title' => $self->{'buttonname'},
287                'contains' => [],
288                'mdtype' => $first_metadata_group );
289
290    # Recursively create the classification hierarchy, one level for each metadata group
291    $self->add_level(\@metadata_groups, \@OIDs, \%classifier_node);
292    return \%classifier_node;
293}
294
295
296sub add_level
297{
298    my $self = shift(@_);
299    my @metadata_groups = @{shift(@_)};
300    my @OIDs = @{shift(@_)};
301    my $classifier_node = shift(@_);
302    # print STDERR "\nAdding AZ list for " . $classifier_node->{'Title'} . "\n";
303
304    my $metadata_group = $metadata_groups[0];
305    # print STDERR "Processing metadata group: " . $metadata_group . "\n";
306    # print STDERR "Number of OID values: " . @OIDs . "\n";
307
308    if (!defined($self->{$metadata_group . ".list"})) {
309    print STDERR "Warning: No metadata values assigned to $metadata_group.\n";
310    return;
311    }
312
313    # Create a mapping from metadata value to OID
314    my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
315    my %metadata_value_to_OIDs_hash = ();
316    foreach my $OID (@OIDs)
317    {
318    if ($OID_to_metadata_values_hash_ref->{$OID})
319    {
320        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
321        foreach my $metadata_value (@metadata_values)
322        {
323        push(@{$metadata_value_to_OIDs_hash{$metadata_value}}, $OID);
324        }
325    }
326    }
327    # print STDERR "Number of distinct values: " . scalar(keys %metadata_value_to_OIDs_hash) . "\n";
328
329    # Partition the values (if necessary)
330    my $partition_type_within_level = $self->{"partition_type_within_level"};
331    if ($partition_type_within_level =~ /^per_letter$/i) {
332    # Generate one hlist for each letter
333    my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
334    my %metadata_value_to_OIDs_subhash = ();
335
336    my $lastpartition = substr($sortedmetadata_values[0], 0, 1);
337    foreach my $metadata_value (@sortedmetadata_values) {
338        my $metadata_valuepartition = substr($metadata_value, 0, 1);
339
340        # Is this the start of a new partition?
341        if ($metadata_valuepartition ne $lastpartition) {
342        $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
343        %metadata_value_to_OIDs_subhash = ();
344        $lastpartition = $metadata_valuepartition;
345        }
346
347        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};       
348    }
349
350    # Don't forget to add the last partition
351    $self->add_hlist_partition(\@metadata_groups, $classifier_node, $lastpartition, \%metadata_value_to_OIDs_subhash);
352
353    # The partitions are stored in an HList
354    $classifier_node->{'childtype'} = "HList";
355    }
356
357    else {
358    # Generate hlists of a certain size
359    my $partition_size_within_level = $self->{$metadata_group . ".partition_size_within_level"};
360    if ($partition_type_within_level =~ /^constant_size$/i && scalar(keys %metadata_value_to_OIDs_hash) > $partition_size_within_level) {
361        my @sortedmetadata_values = $self->sort_metadata_values_array(keys(%metadata_value_to_OIDs_hash));
362        my $itemsdone = 0;
363        my %metadata_value_to_OIDs_subhash = ();
364        my $lastpartitionend = "";
365        my $partitionstart;
366        foreach my $metadata_value (@sortedmetadata_values) {
367        $metadata_value_to_OIDs_subhash{$metadata_value} = $metadata_value_to_OIDs_hash{$metadata_value};
368        $itemsdone++;
369        my $itemsinpartition = scalar(keys %metadata_value_to_OIDs_subhash);
370
371        # Is this the start of a new partition?
372        if ($itemsinpartition == 1) {
373            $partitionstart = $self->generate_partition_start($metadata_value, $lastpartitionend, $self->{"partition_name_length"});
374        }
375
376        # Is this the end of the partition?
377        if ($itemsinpartition == $partition_size_within_level || $itemsdone == @sortedmetadata_values) {
378            my $partitionend = $self->generate_partition_end($metadata_value, $partitionstart, $self->{"partition_name_length"});
379            my $partitionname = $partitionstart;
380            if ($partitionend ne $partitionstart) {
381            $partitionname = $partitionname . "-" . $partitionend;
382            }
383
384            $self->add_hlist_partition(\@metadata_groups, $classifier_node, $partitionname, \%metadata_value_to_OIDs_subhash);
385            %metadata_value_to_OIDs_subhash = ();
386            $lastpartitionend = $partitionend;
387        }
388        }
389
390        # The partitions are stored in an HList
391        $classifier_node->{'childtype'} = "HList";
392    }
393
394    # Otherwise just add all the values to a VList
395    else {
396        $self->add_vlist(\@metadata_groups, $classifier_node, \%metadata_value_to_OIDs_hash);
397    }
398    }
399}
400
401
402sub convert_utf8_string_to_unicode_string
403{
404    my $self = shift(@_);
405    my $utf8_string = shift(@_);
406
407    my $unicode_string = "";
408    foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
409    $unicode_string .= chr($unicode_value);
410    }
411    return $unicode_string;
412}
413
414
415sub convert_unicode_string_to_utf8_string
416{
417    my $self = shift(@_);
418    my $unicode_string = shift(@_);
419
420    my @unicode_array;
421    for (my $i = 0; $i < length($unicode_string); $i++) {
422    push(@unicode_array, ord(substr($unicode_string, $i, 1)));
423    }
424    return &unicode::unicode2utf8(\@unicode_array);
425}
426
427
428sub generate_partition_start
429{
430    my $self = shift(@_);
431    my $metadata_value = shift(@_);
432    my $lastpartitionend = shift(@_);
433    my $partition_name_length = shift(@_);
434
435    if ($partition_name_length) {
436    return substr($metadata_value, 0, $partition_name_length);
437    }
438
439    my $partitionstart = substr($metadata_value, 0, 1);
440    if ($partitionstart le $lastpartitionend) {
441    $partitionstart = substr($metadata_value, 0, 2);
442    # Give up after three characters
443    if ($partitionstart le $lastpartitionend) {
444        $partitionstart = substr($metadata_value, 0, 3);
445    }
446    }
447
448    return $partitionstart;
449}
450
451
452sub generate_partition_end
453{
454    my $self = shift(@_);
455    my $metadata_value = shift(@_);
456    my $partitionstart = shift(@_);
457    my $partition_name_length = shift(@_);
458
459    if ($partition_name_length) {
460    return substr($metadata_value, 0, $partition_name_length);
461    }
462
463    my $partitionend = substr($metadata_value, 0, length($partitionstart));
464    if ($partitionend gt $partitionstart) {
465    $partitionend = substr($metadata_value, 0, 1);
466    if ($partitionend le $partitionstart) {
467        $partitionend = substr($metadata_value, 0, 2);
468        # Give up after three characters
469        if ($partitionend le $partitionstart) {
470        $partitionend = substr($metadata_value, 0, 3);
471        }
472    }
473    }
474
475    return $partitionend;
476}
477
478
479sub add_hlist_partition
480{
481    my $self = shift(@_);
482    my @metadata_groups = @{shift(@_)};
483    my $classifier_node = shift(@_);
484    my $partitionname = shift(@_);
485    my $metadata_value_to_OIDs_hash_ref = shift(@_);
486
487    # Create an hlist partition
488    my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($partitionname),
489                  'childtype' => "VList",
490                  'contains' => [] );
491
492    # Add the children to the hlist partition
493    $self->add_vlist(\@metadata_groups, \%child_classifier_node, $metadata_value_to_OIDs_hash_ref);
494    push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
495}
496
497
498sub add_vlist
499{
500    my $self = shift(@_);
501    my @metadata_groups = @{shift(@_)};
502    my $classifier_node = shift(@_);
503    my $metadata_value_to_OIDs_hash_ref = shift(@_);
504
505    my $metadata_group = shift(@metadata_groups);
506    $classifier_node->{'mdtype'} = $metadata_group;
507
508    # Create an entry in the vlist for each value
509    foreach my $metadata_value ($self->sort_metadata_values_array(keys(%{$metadata_value_to_OIDs_hash_ref})))
510    {
511    my @OIDs = @{$metadata_value_to_OIDs_hash_ref->{$metadata_value}};
512
513    # If there is only one item and 'always_bookshelf' is false, add the item to the list
514    if (@OIDs == 1 && $self->{$metadata_group . ".always_bookshelf"} eq "f") {
515        my $OID = $OIDs[0];
516
517        # Find the offset of this metadata value
518        my $offset = 0;
519        my $OID_to_metadata_values_hash_ref = $self->{$metadata_group . ".list"};
520        my @metadata_values = @{$OID_to_metadata_values_hash_ref->{$OID}};
521        for (my $i = 0; $i < scalar(@metadata_values); $i++) {
522        if ($metadata_value eq $metadata_values[$i]) {
523            $offset = $i;
524            last;
525        }
526        }
527        push(@{$classifier_node->{'contains'}}, { 'OID' => $OID, 'offset' => $offset });
528    }
529
530    # Otherwise create a sublist (bookshelf) for the metadata value
531    else {
532        my %child_classifier_node = ( 'Title' => $self->convert_unicode_string_to_utf8_string($metadata_value),
533                      'childtype' => "VList",
534                      'contains' => [] );
535
536        # If there are metadata elements remaining, recursively apply the process
537        if (@metadata_groups > 0) {
538        my $next_metadata_group = $metadata_groups[0];
539        $child_classifier_node{'childtype'} = $self->{$next_metadata_group . ".list_type"};
540        $self->add_level(\@metadata_groups, \@OIDs, \%child_classifier_node);
541        }
542        # Otherwise just add the documents as children of this list
543        else {
544        # Sort the leaf nodes by the metadata elements specified with -sort_leaf_nodes_using
545        my @sort_leaf_nodes_usingmetadata_groups = @{$self->{'sort_leaf_nodes_using_metadata_groups'}};
546        foreach my $sort_leaf_nodes_usingmetaelem (reverse @sort_leaf_nodes_usingmetadata_groups) {
547            my $OID_to_metadata_values_hash_ref = $self->{$sort_leaf_nodes_usingmetaelem . ".list"};
548            # Force a stable sort (Perl 5.6's sort isn't stable)
549            # !! The [0] bits aren't ideal (multiple metadata values) !!
550            @OIDs = @OIDs[ sort { $OID_to_metadata_values_hash_ref->{$OIDs[$a]}[0] cmp $OID_to_metadata_values_hash_ref->{$OIDs[$b]}[0] || $a <=> $b; } 0..$#OIDs ];
551        }
552
553        foreach my $OID (@OIDs) {
554            push(@{$child_classifier_node{'contains'}}, { 'OID' => $OID });
555        }
556        }
557
558        # Add the sublist to the list
559        push(@{$classifier_node->{'contains'}}, \%child_classifier_node);
560    }
561    }
562}
563
564
565sub sort_metadata_values_array
566{
567    my $self = shift(@_);
568    my @metadata_values = @_;
569
570    if ($self->{'unicode_collator'}) {
571    return $self->{'unicode_collator'}->sort(@metadata_values);
572    }
573    else {
574    return sort(@metadata_values);
575    }
576}
577
578
5791;
Note: See TracBrowser for help on using the browser.