Ignore:
Timestamp:
2024-02-13T17:15:40+13:00 (4 months ago)
Author:
davidb
Message:

New mapping_metadata_file rule-table functionality

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BaseImporter.pm

    r37048 r38742  
    9999    'type' => "string",
    100100    'reqd' => "no" },
     101      { 'name' => "metadata_mapping_file",
     102    'desc' => "{BaseImporter.metadata_mapping_file}",
     103    'type' => "string",
     104    #'deft' => "metadata_mapping_rules.csv", # commenting out, explicitly giving this as the default means every plugin automatically activates it
     105    'deft' => "",
     106    'reqd' => "no" },     
    101107      { 'name' => "OIDtype",
    102108    'desc' => "{import.OIDtype}",
     
    155161        'args'     => $arguments };
    156162
     163sub setup_metadata_mapping_rules
     164{
     165    my $self = shift (@_); 
     166    my ($metadata_mapping_full_filename) = @_;
     167
     168    my $csv_file = &util::read_utf8_textfile($metadata_mapping_full_filename);
     169
     170    my @csv_lines = split('\n',$csv_file);
     171
     172    my $line_num = 0;
     173
     174    my $metadata_mapping_rules = [];
     175   
     176    foreach my $csv_line (@csv_lines) {
     177    chomp($csv_line);
     178
     179    $line_num++;
     180   
     181    next if ($csv_line =~ m/^\s*$/);
     182    next if ($csv_line =~ m/^#/);
     183
     184    # my @tsv_entries = split('\t',$tsv_line);
     185    my @csv_entries = split(/\s*,\s*/,$csv_line);
     186
     187    my $num_csv_entries = scalar(@csv_entries);
     188
     189    # print STDERR join(",",@tsv_entries), "\n\n";
     190   
     191    if (($num_csv_entries == 5) || ($num_csv_entries >= 6 && $csv_entries[5] =~ m/^#/)) {
     192        my $src_metadata_name  = $csv_entries[0];
     193        (my $src_metadata_regex = $csv_entries[1]) =~ s/(^\/)|(\/$)//g; #assigns as a /qr (quoted regex) and then strips off leading and trailing slash
     194        (my $dst_metadata_regex = $csv_entries[2]) =~ s/(^\/)|(\/$)//g; #as line above
     195        (my $regex_modifiers    = $csv_entries[3]) =~ s/(^')|('$)//g;   #similar to above but strips out single quotes
     196        my $dst_metadata_name  = $csv_entries[4];
     197
     198        # If the last entry has a '# comments' after it, needs a bit more string manipulation
     199        # to tidy it up
     200        $dst_metadata_name =~ s/#.*$//;
     201        $dst_metadata_name =~ s/\s*$//;
     202       
     203        my $regex_sub = "s/$src_metadata_regex/$dst_metadata_regex/$regex_modifiers";
     204
     205        my $metadata_mapping_rule_rec = {
     206        "src_metadata_name" => $src_metadata_name,
     207        "regex_sub"         => $regex_sub,
     208        "dst_metadata_name" => $dst_metadata_name           
     209        };
     210           
     211        push(@$metadata_mapping_rules,$metadata_mapping_rule_rec);
     212    }
     213    else {
     214        print STDERR "Warning: syntax error in $metadata_mapping_full_filename, line $line_num\n";
     215        print STDERR "  $csv_line\n";
     216        print STDERR "Did not contain 5 tab-delimited entries\n";
     217    }
     218       
     219    }
     220
     221    if (scalar(@$metadata_mapping_rules)>0) {
     222    $self->{'metadata_mapping_rules'} = $metadata_mapping_rules;
     223    }
     224}
     225
    157226sub new {
    158227
     
    165234
    166235    my $self = new CommonUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
    167    
     236
    168237    if ($self->{'info_only'}) {
    169238        # don't worry about any options etc
     
    205274    }
    206275
    207     return bless $self, $class;
     276    my $blessed_self = bless $self, $class;
     277   
     278    if ($blessed_self->{'metadata_mapping_file'}) {
     279    my $metadata_mapping_file = $blessed_self->{'metadata_mapping_file'};
     280    my $mmf_full_filename; 
     281    $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $metadata_mapping_file);
     282    if (!-e $mmf_full_filename) {
     283        my $mmf_full_col_filename = $mmf_full_filename;
     284        $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $metadata_mapping_file);
     285
     286        if (!-e $mmf_full_filename) {
     287        my $outhandle = $blessed_self->{'outhandle'};
     288        print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n";
     289        print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n";
     290        $blessed_self->print_txt_usage("");  # Use default resource bundle
     291        print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n";
     292        print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n";
     293        die "\n";
     294        }
     295    }
     296
     297    $blessed_self->{'metadata_mapping_full_filename'} = $mmf_full_filename;
     298    $blessed_self->setup_metadata_mapping_rules($mmf_full_filename);
     299    }
     300
     301   
     302    return $blessed_self;
    208303
    209304}
     
    688783}
    689784
     785
     786sub apply_metadata_mapping_file {
     787    my $self = shift (@_); 
     788    my ($doc_obj) = @_;
     789
     790    my $verbosity = $self->{'verbosity'};
     791    my $outhandle = $self->{'outhandle'};
     792   
     793    my $top_section = $doc_obj->get_top_section();
     794
     795    my $metadata_mapping_rules = $self->{'metadata_mapping_rules'};
     796
     797    my $transient_metadata_names = {};
     798    my $new_metadata_names = {};
     799
     800    print $outhandle "  Applying metadata_mapping_file to document\n";
     801   
     802    foreach my $metadata_mapping_rule_rec (@$metadata_mapping_rules) {
     803       
     804    my $src_metadata_name = $metadata_mapping_rule_rec->{"src_metadata_name"};
     805    my $dst_metadata_name = $metadata_mapping_rule_rec->{"dst_metadata_name"};
     806
     807    my $regex_sub = $metadata_mapping_rule_rec->{"regex_sub"};
     808
     809    my $metadata_vals = $doc_obj->get_metadata($top_section,$src_metadata_name);
     810
     811    foreach my $metadata_val (@$metadata_vals) {
     812
     813        my $store_metadata_val = $metadata_val;
     814       
     815        if ($verbosity >= 4) {
     816        print $outhandle "    Testing for match with: \$metadata_val =~ $regex_sub\n"
     817        }
     818        eval ( "\$metadata_val =~ $regex_sub" );
     819        if ($@) {
     820        warn "$@";
     821        }
     822        else {
     823        # print STDERR "**** after metadata_val = $metadata_val\n\n";
     824        if ($verbosity >=2) {
     825            if ($metadata_val ne $store_metadata_val) {
     826
     827            if ($verbosity >= 3) {
     828                print $outhandle "      Transformed metadata to $dst_metadata_name: '$store_metadata_val' -> '$metadata_val'\n"
     829            }
     830            if ($verbosity >= 2 && ($dst_metadata_name !~ m/^_transient/)) {
     831                print $outhandle "    Added new metadata $dst_metadata_name: '$metadata_val'\n"
     832            }
     833            }
     834        }
     835        $doc_obj->add_utf8_metadata($top_section,$dst_metadata_name,$metadata_val);
     836
     837        if ($dst_metadata_name =~ m/^_transient/) {
     838            $transient_metadata_names->{$dst_metadata_name} = 1;
     839        }
     840        else {
     841            if ($metadata_val ne $store_metadata_val) {
     842            $new_metadata_names->{$dst_metadata_name} = 1;
     843            }
     844        }       
     845        }
     846    }
     847   
     848    }
     849
     850    if ($verbosity >=1) {
     851    print $outhandle "  Number of metadata_mapping_file transformations for this document: ", scalar(keys %$new_metadata_names), "\n";
     852
     853    if ($verbosity >=2) {
     854        print $outhandle "  Generated metadata names: ", join(", ",keys %$new_metadata_names), "\n";
     855    }
     856    }
     857   
     858    # Now remove any of dst_metadata_name which began _transient
     859    foreach my $transient_metadata_name (keys %$transient_metadata_names) {
     860    #print STDERR "transient_metadata_name = $transient_metadata_name\n";   
     861    $doc_obj->delete_metadata($top_section,$transient_metadata_name);
     862    }
     863}
     864
    690865sub post_process_doc_obj {
    691866    my $self = shift (@_); 
    692867    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    693868
     869    if ($self->{'metadata_mapping_rules'}) {
     870    $self->apply_metadata_mapping_file($doc_obj);
     871    }
     872   
    694873    return 1;
    695874}
     
    820999    return undef unless $self->can_process_this_file($filename_full_path);
    8211000   
    822     #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
     1001    #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
    8231002    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
    8241003    #print STDERR "**** AFTER READ INTO DOC OBJ: $file\n";
Note: See TracChangeset for help on using the changeset viewer.