Changeset 34840


Ignore:
Timestamp:
2021-02-13T23:37:22+13:00 (3 years ago)
Author:
davidb
Message:

Changed to apply extra-metadata before trying to work out the doc-id. This trial change is related to JSONSparqlResultPlugin (which inherits from SplitTextFile): we want to use metadata fields in the JSON file (country+year combined) to form the ID of the document, so the extra-metadata needs to be processed before the add_OID call is made.

File:
1 copied

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm--for-gs311

    r34646 r34840  
    114114sub get_default_split_exp {
    115115    return q^\n\s*\n^;
     116}
     117
     118sub split_text_into_segments {
     119    my $self = shift (@_);
     120    my ($textref) = @_;
     121   
     122       
     123    # Split the text into several smaller segments
     124    my $split_exp = $self->{'split_exp'};
     125    my @tmp  = split(/$split_exp/i, $$textref);
     126   
     127    my @segments =();
     128    ## get rid of empty segments
     129    foreach my $seg (@tmp){
     130    if ($seg ne ""){
     131        push @segments, $seg;
     132    }
     133    }
     134
     135    return \@segments;
    116136}
    117137
     
    160180   
    161181    # Split the text into several smaller segments
    162     my $split_exp = $self->{'split_exp'};
    163         my @tmp  = split(/$split_exp/i, $text);
    164     my @segments =();
    165     ## get rid of empty segments
    166     foreach my $seg (@tmp){
    167         if ($seg ne ""){
    168         push @segments, $seg;
    169         }
    170     }
    171 
    172     print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
     182#   my $split_exp = $self->{'split_exp'};
     183#        my @tmp  = split(/$split_exp/i, $text);
     184#   my @segments =();
     185#   ## get rid of empty segments
     186#   foreach my $seg (@tmp){
     187#       if ($seg ne ""){
     188#       push @segments, $seg;
     189#       }
     190#   }
     191#
     192#       print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
     193#       if $self->{'verbosity'};
     194#   
     195#   $self->{'split_segments'}->{$file} = \@segments;
     196#       return  scalar(@segments);
     197   
     198        my $segments = $self->split_text_into_segments(\$text);
     199
     200        my $num_segments = scalar(@$segments);
     201   
     202        print $outhandle "SplitTextFile found $num_segments documents in $filename\n"
    173203        if $self->{'verbosity'};
    174204   
    175     $self->{'split_segments'}->{$file} = \@segments;
    176    
    177     return  scalar(@segments);
     205    $self->{'split_segments'}->{$file} = $segments;
     206
     207        return  $num_segments;
    178208}
    179209
     
    243273    #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
    244274
     275    # include any metadata passed in from previous plugins
     276    # note that this metadata is associated with the top level section
     277    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     278   
    245279    # Calculate a "base" document ID.
    246280    if (!defined $id) {
     
    248282    }
    249283   
    250     # include any metadata passed in from previous plugins
    251     # note that this metadata is associated with the top level section
    252     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     284#   # include any metadata passed in from previous plugins
     285#   # note that this metadata is associated with the top level section
     286#   $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    253287
    254288    # do plugin specific processing of doc_obj
Note: See TracChangeset for help on using the changeset viewer.