Changeset 38742 for main/trunk


Ignore:
Timestamp:
2024-02-13T17:15:40+13:00 (3 months ago)
Author:
davidb
Message:

New metadata_mapping_file rule-table functionality

Location:
main/trunk/greenstone2/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BaseImporter.pm

    r37048 r38742  
    9999    'type' => "string",
    100100    'reqd' => "no" },
     101      { 'name' => "metadata_mapping_file",
     102    'desc' => "{BaseImporter.metadata_mapping_file}",
     103    'type' => "string",
     104    #'deft' => "metadata_mapping_rules.csv", # commenting out, explicitly giving this as the default means every plugin automatically activates it
     105    'deft' => "",
     106    'reqd' => "no" },     
    101107      { 'name' => "OIDtype",
    102108    'desc' => "{import.OIDtype}",
     
    155161        'args'     => $arguments };
    156162
     163sub setup_metadata_mapping_rules
     164{
     165    my $self = shift (@_); 
     166    my ($metadata_mapping_full_filename) = @_;
     167
     168    my $csv_file = &util::read_utf8_textfile($metadata_mapping_full_filename);
     169
     170    my @csv_lines = split('\n',$csv_file);
     171
     172    my $line_num = 0;
     173
     174    my $metadata_mapping_rules = [];
     175   
     176    foreach my $csv_line (@csv_lines) {
     177    chomp($csv_line);
     178
     179    $line_num++;
     180   
     181    next if ($csv_line =~ m/^\s*$/);
     182    next if ($csv_line =~ m/^#/);
     183
     184    # my @tsv_entries = split('\t',$tsv_line);
     185    my @csv_entries = split(/\s*,\s*/,$csv_line);
     186
     187    my $num_csv_entries = scalar(@csv_entries);
     188
     189    # print STDERR join(",",@tsv_entries), "\n\n";
     190   
     191    if (($num_csv_entries == 5) || ($num_csv_entries >= 6 && $csv_entries[5] =~ m/^#/)) {
     192        my $src_metadata_name  = $csv_entries[0];
     193        (my $src_metadata_regex = $csv_entries[1]) =~ s/(^\/)|(\/$)//g; #assigns as a /qr (quoted regex) and then strips off leading and trailing slash
     194        (my $dst_metadata_regex = $csv_entries[2]) =~ s/(^\/)|(\/$)//g; #as line above
     195        (my $regex_modifiers    = $csv_entries[3]) =~ s/(^')|('$)//g;   #similar to above but strips out single quotes
     196        my $dst_metadata_name  = $csv_entries[4];
     197
     198        # If the last entry has a '# comments' after it, needs a bit more string manipulation
     199        # to tidy it up
     200        $dst_metadata_name =~ s/#.*$//;
     201        $dst_metadata_name =~ s/\s*$//;
     202       
     203        my $regex_sub = "s/$src_metadata_regex/$dst_metadata_regex/$regex_modifiers";
     204
     205        my $metadata_mapping_rule_rec = {
     206        "src_metadata_name" => $src_metadata_name,
     207        "regex_sub"         => $regex_sub,
     208        "dst_metadata_name" => $dst_metadata_name           
     209        };
     210           
     211        push(@$metadata_mapping_rules,$metadata_mapping_rule_rec);
     212    }
     213    else {
     214        print STDERR "Warning: syntax error in $metadata_mapping_full_filename, line $line_num\n";
     215        print STDERR "  $csv_line\n";
     216        print STDERR "Did not contain 5 tab-delimited entries\n";
     217    }
     218       
     219    }
     220
     221    if (scalar(@$metadata_mapping_rules)>0) {
     222    $self->{'metadata_mapping_rules'} = $metadata_mapping_rules;
     223    }
     224}
     225
    157226sub new {
    158227
     
    165234
    166235    my $self = new CommonUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
    167    
     236
    168237    if ($self->{'info_only'}) {
    169238        # don't worry about any options etc
     
    205274    }
    206275
    207     return bless $self, $class;
     276    my $blessed_self = bless $self, $class;
     277   
     278    if ($blessed_self->{'metadata_mapping_file'}) {
     279    my $metadata_mapping_file = $blessed_self->{'metadata_mapping_file'};
     280    my $mmf_full_filename; 
     281    $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $metadata_mapping_file);
     282    if (!-e $mmf_full_filename) {
     283        my $mmf_full_col_filename = $mmf_full_filename;
     284        $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $metadata_mapping_file);
     285
     286        if (!-e $mmf_full_filename) {
     287        my $outhandle = $blessed_self->{'outhandle'};
     288        print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n";
     289        print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n";
     290        $blessed_self->print_txt_usage("");  # Use default resource bundle
     291        print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n";
     292        print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n";
     293        die "\n";
     294        }
     295    }
     296
     297    $blessed_self->{'metadata_mapping_full_filename'} = $mmf_full_filename;
     298    $blessed_self->setup_metadata_mapping_rules($mmf_full_filename);
     299    }
     300
     301   
     302    return $blessed_self;
    208303
    209304}
     
    688783}
    689784
     785
     786sub apply_metadata_mapping_file {
     787    my $self = shift (@_); 
     788    my ($doc_obj) = @_;
     789
     790    my $verbosity = $self->{'verbosity'};
     791    my $outhandle = $self->{'outhandle'};
     792   
     793    my $top_section = $doc_obj->get_top_section();
     794
     795    my $metadata_mapping_rules = $self->{'metadata_mapping_rules'};
     796
     797    my $transient_metadata_names = {};
     798    my $new_metadata_names = {};
     799
     800    print $outhandle "  Applying metadata_mapping_file to document\n";
     801   
     802    foreach my $metadata_mapping_rule_rec (@$metadata_mapping_rules) {
     803       
     804    my $src_metadata_name = $metadata_mapping_rule_rec->{"src_metadata_name"};
     805    my $dst_metadata_name = $metadata_mapping_rule_rec->{"dst_metadata_name"};
     806
     807    my $regex_sub = $metadata_mapping_rule_rec->{"regex_sub"};
     808
     809    my $metadata_vals = $doc_obj->get_metadata($top_section,$src_metadata_name);
     810
     811    foreach my $metadata_val (@$metadata_vals) {
     812
     813        my $store_metadata_val = $metadata_val;
     814       
     815        if ($verbosity >= 4) {
     816        print $outhandle "    Testing for match with: \$metadata_val =~ $regex_sub\n"
     817        }
     818        eval ( "\$metadata_val =~ $regex_sub" );
     819        if ($@) {
     820        warn "$@";
     821        }
     822        else {
     823        # print STDERR "**** after metadata_val = $metadata_val\n\n";
     824        if ($verbosity >=2) {
     825            if ($metadata_val ne $store_metadata_val) {
     826
     827            if ($verbosity >= 3) {
     828                print $outhandle "      Transformed metadata to $dst_metadata_name: '$store_metadata_val' -> '$metadata_val'\n"
     829            }
     830            if ($verbosity >= 2 && ($dst_metadata_name !~ m/^_transient/)) {
     831                print $outhandle "    Added new metadata $dst_metadata_name: '$metadata_val'\n"
     832            }
     833            }
     834        }
     835        $doc_obj->add_utf8_metadata($top_section,$dst_metadata_name,$metadata_val);
     836
     837        if ($dst_metadata_name =~ m/^_transient/) {
     838            $transient_metadata_names->{$dst_metadata_name} = 1;
     839        }
     840        else {
     841            if ($metadata_val ne $store_metadata_val) {
     842            $new_metadata_names->{$dst_metadata_name} = 1;
     843            }
     844        }       
     845        }
     846    }
     847   
     848    }
     849
     850    if ($verbosity >=1) {
     851    print $outhandle "  Number of metadata_mapping_file transformations for this document: ", scalar(keys %$new_metadata_names), "\n";
     852
     853    if ($verbosity >=2) {
     854        print $outhandle "  Generated metadata names: ", join(", ",keys %$new_metadata_names), "\n";
     855    }
     856    }
     857   
     858    # Now remove any of dst_metadata_name which began _transient
     859    foreach my $transient_metadata_name (keys %$transient_metadata_names) {
     860    #print STDERR "transient_metadata_name = $transient_metadata_name\n";   
     861    $doc_obj->delete_metadata($top_section,$transient_metadata_name);
     862    }
     863}
     864
    # Hook run on a document object after it has been built.  When this
    # plugin has metadata mapping rules loaded they are applied to $doc_obj;
    # the remaining arguments are accepted but not used here.  Always
    # returns 1.
    sub post_process_doc_obj {
        my ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

        my $has_mapping_rules = $self->{'metadata_mapping_rules'};
        $self->apply_metadata_mapping_file($doc_obj) if ($has_mapping_rules);

        return 1;
    }
     
    820999    return undef unless $self->can_process_this_file($filename_full_path);
    8211000   
    822     #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
     1001    #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
    8231002    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
    8241003    #print STDERR "**** AFTER READ INTO DOC OBJ: $file\n";
  • main/trunk/greenstone2/perllib/strings.properties

    r38736 r38742  
    809809BaseImporter.no_cover_image:Do not look for a prefix.jpg file (where prefix is the same prefix as the file being processed) to associate as a cover image.
    810810
     811BaseImporter.metadata_mapping_file:Use the specified metadata mapping file to generate additional metadata for a document.  The specified comma-separated value (csv) file needs to be encoded as UTF-8, and consists of a series of rules, with 5 entries per line.  The first entry in the line specifies the source metadata name whose value is selected from the document being processed, and the second entry is a regular expression the metadata must match for the rule to be applied (Note: the syntax used is Perl's regular expression substitution, where parentheses form capture groups).  If it does match, then the third entry is what the matching metadata value is transformed into (groups formed with parentheses in the second entry can be referenced as $1, $2 and so on). The fourth entry specifies any modifiers for the substitution, such as 'g' for global and 'i' for case-insensitive.  The fifth entry specifies the metadata name that is set with the newly created value.  The rules are applied in the order they are provided in the comma-separated value file, so it is permissible for metadata set by one of the earlier rules to then be used in a later matching rule. Destination metadata names that start with '_transient' are not stored in the final document.  For an example of a metadata_mapping_file, refer to the one provided in GSDLHOME/etc/metadata_mapping_rules.csv
     812
    811813BaseImporter.OIDtype.auto:Use OIDtype set in import.pl
    812814
Note: See TracChangeset for help on using the changeset viewer.