Changeset 26267 for main

Show
Ignore:
Timestamp:
27.09.2012 14:24:22 (7 years ago)
Author:
kjdon
Message:

Added the filter_metadata and filter_regex options to filter the documents going into the classifier. These can be used to restrict which documents get added to the classifier; e.g. if you have several different DocType metadata values, you can build a classifier on just one or some of the doctypes.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r24193 r26267  
    108108    'desc' => "{List.use_hlist_for}", 
    109109    'type' => "string" }, 
     110      {'name' => "filter_metadata", 
     111       'desc' => "{List.filter_metadata}", 
     112       'type' => "metadata"}, 
     113      {'name' => "filter_regex", 
     114       'desc' => "{List.filter_regex}", 
     115       'type' => "regexp"}, 
     116       
    110117      { 'name' => "removeprefix", 
    111118    'desc' => "{BasClas.removeprefix}", 
     
    302309    } 
    303310    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1; 
     311    # check against filter here 
     312    if ($self->{'filter_metadata'}) { 
     313    #print STDERR "filtering documents on $self->{'filter_metadata'}\n"; 
     314    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'}); 
     315    return unless defined $meta; 
     316    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) { 
     317        #print STDERR "doesn't match regex\n"; 
     318        return; 
     319 
     320    } 
     321    } 
     322    # if we get here, we have passed the test for filtering 
    304323    # If "-classify_sections" is set, classify every section of the document 
    305324    if ($self->{'classify_sections'}) { 
     
    324343    my @metadata_groups = @{$self->{'metadata_groups'}}; 
    325344 
     345     
    326346    # Only classify the section if it has a value for one of the metadata elements in the first group 
    327347    my $classify_section = 0;