Ignore:
Timestamp:
2022-10-03T11:14:38+13:00 (19 months ago)
Author:
kjdon
Message:

Updated DateList with the changes added for Heritage NZ. Allow documents with invalid or missing dates to be included in the classifier - if the option is set - in an 'invalid date' partition.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/classify/DateList.pm

    r34109 r36670  
    3434# -metadata, use a different metadata for the date (instead of Date); still expects yyyymmdd format. This affects display because Greenstone displays Date metadata as dd month yyyy, whereas any other date metadata is displayed as yyyymmdd - this needs fixing
    3535# -sort specifies an additional metadata to use in sorting, will take effect when two docs have the same date.
    36 
     36# -reverse_sort - sort in reverse order
     37# -no_special_formatting - makes the list a VList instead of a DateList - don't display Months down the side of the list
     38# -valid_date_regex - what constitutes a valid date? The default is \d\d\d\d. E.g. for Heritage, we want to customise this to allow dates like 198?
     39# -allow_invalid_dates - do we include in the classifier documents with invalid dates? deft = no.
     40# -invalid_date_partition_name - if docs with invalid dates are included, they get put into one bucket. this gives the name of that bucket. eg "no date"
    3741package DateList;
    3842
     
    7276    'desc' => "{DateList.no_special_formatting}",
    7377    'type' => "flag",
    74     'reqd' => "no" }
    75      
     78        'reqd' => "no" },
     79      { 'name' => "valid_date_regex",
     80    'desc' => "{DateList.valid_date_regex}",
     81        'type' => "regexp",
     82        'deft' => "\\d\\d\\d\\d",
     83        'reqd' => "no" },     
     84      { 'name' => "allow_invalid_dates",
     85    'desc' => "{DateList.allow_invalid_dates}",
     86    'type' => "flag",
     87        'reqd' => "no" },
     88       { 'name' => "invalid_date_partition_name",
     89         'desc' => "{DateList.invalid_date_partition_name}",
     90         'type' => "string",
     91         'deft' => "No Date" }     
    7692      ];
    7793
     
    137153    # find the first available metadata
    138154    my $date;
     155    my $invalid = 0;
     156    my $validre = $self->{'valid_date_regex'};
     157
    139158    foreach my $m (@{$self->{'meta_list'}}) {
    140159    $date = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $m);
    141160    last if defined $date;
    142     } 
    143    
     161    }
     162
    144163    if (!defined $date || $date eq "") {
    145     # if this document doesn't contain Date element we won't
    146     # include it in this classification
    147     print $outhandle "DateList: $doc_OID has no date, not including it\n" if $verbosity >=2;
    148     return;
     164        if (!$self->{'allow_invalid_dates'}) {
     165            # if this document doesn't contain Date element we won't
     166            # include it in this classification
     167            print $outhandle "DateList: $doc_OID has no date, not including it\n" if $verbosity >=2;
     168            return;
     169        } else {
     170            $invalid = 1;
     171            $date = "INVALID";
     172        }
    149173    }
    150174
    151175    # sanity check date
    152     if ($date !~ /^\d\d\d\d.*/) {
    153     print $outhandle "DateList: $doc_OID date: '$date' malformed: expected it to start with yyyy; not classifying\n" if $verbosity >=2;
    154     return;
    155     }
    156     if ($self->{'bymonth'}) {
     176    #if ($date !~ /^\d\d\d\d.*/) {
     177    if ($date !~ /^$validre.*/) {
     178        if (!$self->{'allow_invalid_dates'}) {
     179            print $outhandle "DateList: $doc_OID date: '$date' malformed: expected it to start with $validre; not classifying\n" if $verbosity >=2;
     180            return;
     181        }
     182        else {
     183            $invalid = 1;
     184            $date = "INVALID";
     185        }
     186    }
     187    if ($self->{'bymonth'} && !$invalid) {
    157188    # check that we have valid month - if not, set it to 00 == undefined
    158     if ($date !~ /^\d\d\d\d-?\d\d/) {
     189    if ($date !~ /^$validre-?\d\d/) {
    159190    print $outhandle "DateList $doc_OID date: '$date' has no month (expecting yyyymm... or yyyy-mm...), setting date to yyyy-00\n" if $verbosity >=2;
    160     $date =~ s/^(\d\d\d\d).*$/$1-00/;
     191    $date =~ s/^($validre).*$/$1-00/;
    161192    } else {
    162         my ($year, $month) = $date =~ /^(\d\d\d\d)-?(\d\d)/;
     193        my ($year, $month) = $date =~ /^($validre)-?(\d\d)/;
    163194        if ($month > 12) {
    164195        print $outhandle "DateList $doc_OID date: '$date' has invalid month, setting date to $year-00\n" if $verbosity >=2;
     
    236267    }
    237268
    238    
     269    my $validre = $self->{'valid_date_regex'};
     270    my $invalid_bucket = $self->{'invalid_date_partition_name'};
    239271
    240272    if ($self->{'bymonth'}) {
     
    299331    foreach my $classification (@$classlistref) {
    300332        my $date = $self->{'list'}->{$classification};
    301         $date =~ s/^(\d\d\d\d).*$/$1/;
     333            if ($date =~ /^INVALID/) {
     334                $date = $invalid_bucket;
     335            } else {
     336                $date =~ s/^($validre).*$/$1/;
     337            }
    302338        $classhash->{$date} = [] unless defined $classhash->{$date};
    303339        push (@{$classhash->{$date}}, $classification);
     
    305341         
    306342    }
    307    
     343
    308344    # only compact the list if nogroup not specified
    309345    if (!$self->{'nogroup'}) {
     346        #print STDERR "compacting list\n";
    310347    $classhash = $self->compactlist ($classhash);
    311348    }
     
    346383    }
    347384    foreach my $subsection (@subsectionlist) {
     385        if ($subsection eq  $self->{'invalid_date_partition_name'}) {
     386            # leave this one as is
     387        $compactedhash->{$subsection} = $classhashref->{$subsection};
     388            next;
     389        }
     390  #      print STDERR "in sub $subsection\n";
    348391    $currentfirstdate = $subsection if $currentfirstdate eq "";
    349392    if ((scalar (@currentOIDs) < $min) ||
Note: See TracChangeset for help on using the changeset viewer.