Ignore:
Timestamp:
2012-09-27T14:24:22+12:00 (12 years ago)
Author:
kjdon
Message:

Added filter_metadata and filter_regex options to filter documents going into the classifier. These can be used to restrict which documents get added to the classifier; e.g. if documents have several different DocType metadata values, a classifier can be built on just one or some of the doctypes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/List.pm

    r24193 r26267  
    108108    'desc' => "{List.use_hlist_for}",
    109109    'type' => "string" },
     110      {'name' => "filter_metadata",
     111       'desc' => "{List.filter_metadata}",
     112       'type' => "metadata"},
     113      {'name' => "filter_regex",
     114       'desc' => "{List.filter_regex}",
     115       'type' => "regexp"},
     116     
    110117      { 'name' => "removeprefix",
    111118    'desc' => "{BasClas.removeprefix}",
     
    302309    }
    303310    $self->{'all_doc_OIDs'}->{$doc_obj->get_OID()} = 1;
     311    # check against filter here
     312    if ($self->{'filter_metadata'}) {
     313    #print STDERR "filtering documents on $self->{'filter_metadata'}\n";
     314    my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'filter_metadata'});
     315    return unless defined $meta;
     316    if ($self->{'filter_regex'} ne "" && $meta !~ /$self->{'filter_regex'}/) {
     317        #print STDERR "doesn't match regex\n";
     318        return;
     319
     320    }
     321    }
     322    # if we get here, we have passed the test for filtering
    304323    # If "-classify_sections" is set, classify every section of the document
    305324    if ($self->{'classify_sections'}) {
     
    324343    my @metadata_groups = @{$self->{'metadata_groups'}};
    325344
     345   
    326346    # Only classify the section if it has a value for one of the metadata elements in the first group
    327347    my $classify_section = 0;
Note: See TracChangeset for help on using the changeset viewer.