Changeset 38742 for main/trunk
- Timestamp:
- 2024-02-13T17:15:40+13:00 (3 months ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BaseImporter.pm
r37048 r38742 99 99 'type' => "string", 100 100 'reqd' => "no" }, 101 { 'name' => "metadata_mapping_file", 102 'desc' => "{BaseImporter.metadata_mapping_file}", 103 'type' => "string", 104 #'deft' => "metadata_mapping_rules.csv", # commenting out, explicitly giving this as the default means every plugin automatically activates it 105 'deft' => "", 106 'reqd' => "no" }, 101 107 { 'name' => "OIDtype", 102 108 'desc' => "{import.OIDtype}", … … 155 161 'args' => $arguments }; 156 162 163 sub setup_metadata_mapping_rules 164 { 165 my $self = shift (@_); 166 my ($metadata_mapping_full_filename) = @_; 167 168 my $csv_file = &util::read_utf8_textfile($metadata_mapping_full_filename); 169 170 my @csv_lines = split('\n',$csv_file); 171 172 my $line_num = 0; 173 174 my $metadata_mapping_rules = []; 175 176 foreach my $csv_line (@csv_lines) { 177 chomp($csv_line); 178 179 $line_num++; 180 181 next if ($csv_line =~ m/^\s*$/); 182 next if ($csv_line =~ m/^#/); 183 184 # my @tsv_entries = split('\t',$tsv_line); 185 my @csv_entries = split(/\s*,\s*/,$csv_line); 186 187 my $num_csv_entries = scalar(@csv_entries); 188 189 # print STDERR join(",",@tsv_entries), "\n\n"; 190 191 if (($num_csv_entries == 5) || ($num_csv_entries >= 6 && $csv_entries[5] =~ m/^#/)) { 192 my $src_metadata_name = $csv_entries[0]; 193 (my $src_metadata_regex = $csv_entries[1]) =~ s/(^\/)|(\/$)//g; #assigns as a /qr (quoted regex) and then strips off leading and trailing slash 194 (my $dst_metadata_regex = $csv_entries[2]) =~ s/(^\/)|(\/$)//g; #as line above 195 (my $regex_modifiers = $csv_entries[3]) =~ s/(^')|('$)//g; #similar to above but strips out single quotes 196 my $dst_metadata_name = $csv_entries[4]; 197 198 # If the last entry has a '# comments' after it, needs a bit more string manipulation 199 # to tidy it up 200 $dst_metadata_name =~ s/#.*$//; 201 $dst_metadata_name =~ s/\s*$//; 202 203 my $regex_sub = "s/$src_metadata_regex/$dst_metadata_regex/$regex_modifiers"; 204 205 my 
$metadata_mapping_rule_rec = { 206 "src_metadata_name" => $src_metadata_name, 207 "regex_sub" => $regex_sub, 208 "dst_metadata_name" => $dst_metadata_name 209 }; 210 211 push(@$metadata_mapping_rules,$metadata_mapping_rule_rec); 212 } 213 else { 214 print STDERR "Warning: syntax error in $metadata_mapping_full_filename, line $line_num\n"; 215 print STDERR " $csv_line\n"; 216 print STDERR "Did not contain 5 tab-delimited entries\n"; 217 } 218 219 } 220 221 if (scalar(@$metadata_mapping_rules)>0) { 222 $self->{'metadata_mapping_rules'} = $metadata_mapping_rules; 223 } 224 } 225 157 226 sub new { 158 227 … … 165 234 166 235 my $self = new CommonUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary); 167 236 168 237 if ($self->{'info_only'}) { 169 238 # don't worry about any options etc … … 205 274 } 206 275 207 return bless $self, $class; 276 my $blessed_self = bless $self, $class; 277 278 if ($blessed_self->{'metadata_mapping_file'}) { 279 my $metadata_mapping_file = $blessed_self->{'metadata_mapping_file'}; 280 my $mmf_full_filename; 281 $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $metadata_mapping_file); 282 if (!-e $mmf_full_filename) { 283 my $mmf_full_col_filename = $mmf_full_filename; 284 $mmf_full_filename = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $metadata_mapping_file); 285 286 if (!-e $mmf_full_filename) { 287 my $outhandle = $blessed_self->{'outhandle'}; 288 print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n"; 289 print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n"; 290 $blessed_self->print_txt_usage(""); # Use default resource bundle 291 print STDERR "\nBaseImporter Error: Can't locate metadata_mapping_file '$metadata_mapping_file'\n"; 292 print STDERR "This file should be in $mmf_full_col_filename or $mmf_full_filename\n"; 293 die "\n"; 294 } 295 } 296 297 $blessed_self->{'metadata_mapping_full_filename'} = 
$mmf_full_filename; 298 $blessed_self->setup_metadata_mapping_rules($mmf_full_filename); 299 } 300 301 302 return $blessed_self; 208 303 209 304 } … … 688 783 } 689 784 785 786 sub apply_metadata_mapping_file { 787 my $self = shift (@_); 788 my ($doc_obj) = @_; 789 790 my $verbosity = $self->{'verbosity'}; 791 my $outhandle = $self->{'outhandle'}; 792 793 my $top_section = $doc_obj->get_top_section(); 794 795 my $metadata_mapping_rules = $self->{'metadata_mapping_rules'}; 796 797 my $transient_metadata_names = {}; 798 my $new_metadata_names = {}; 799 800 print $outhandle " Applying metadata_mapping_file to document\n"; 801 802 foreach my $metadata_mapping_rule_rec (@$metadata_mapping_rules) { 803 804 my $src_metadata_name = $metadata_mapping_rule_rec->{"src_metadata_name"}; 805 my $dst_metadata_name = $metadata_mapping_rule_rec->{"dst_metadata_name"}; 806 807 my $regex_sub = $metadata_mapping_rule_rec->{"regex_sub"}; 808 809 my $metadata_vals = $doc_obj->get_metadata($top_section,$src_metadata_name); 810 811 foreach my $metadata_val (@$metadata_vals) { 812 813 my $store_metadata_val = $metadata_val; 814 815 if ($verbosity >= 4) { 816 print $outhandle " Testing for match with: \$metadata_val =~ $regex_sub\n" 817 } 818 eval ( "\$metadata_val =~ $regex_sub" ); 819 if ($@) { 820 warn "$@"; 821 } 822 else { 823 # print STDERR "**** after metadata_val = $metadata_val\n\n"; 824 if ($verbosity >=2) { 825 if ($metadata_val ne $store_metadata_val) { 826 827 if ($verbosity >= 3) { 828 print $outhandle " Transformed metadata to $dst_metadata_name: '$store_metadata_val' -> '$metadata_val'\n" 829 } 830 if ($verbosity >= 2 && ($dst_metadata_name !~ m/^_transient/)) { 831 print $outhandle " Added new metadata $dst_metadata_name: '$metadata_val'\n" 832 } 833 } 834 } 835 $doc_obj->add_utf8_metadata($top_section,$dst_metadata_name,$metadata_val); 836 837 if ($dst_metadata_name =~ m/^_transient/) { 838 $transient_metadata_names->{$dst_metadata_name} = 1; 839 } 840 else { 841 if 
($metadata_val ne $store_metadata_val) { 842 $new_metadata_names->{$dst_metadata_name} = 1; 843 } 844 } 845 } 846 } 847 848 } 849 850 if ($verbosity >=1) { 851 print $outhandle " Number of metadata_mapping_file transformations for this document: ", scalar(keys %$new_metadata_names), "\n"; 852 853 if ($verbosity >=2) { 854 print $outhandle " Generated metadata names: ", join(", ",keys %$new_metadata_names), "\n"; 855 } 856 } 857 858 # Now remove any of dst_metadata_name which began _transient 859 foreach my $transient_metadata_name (keys %$transient_metadata_names) { 860 #print STDERR "transient_metadata_name = $transient_metadata_name\n"; 861 $doc_obj->delete_metadata($top_section,$transient_metadata_name); 862 } 863 } 864 690 865 sub post_process_doc_obj { 691 866 my $self = shift (@_); 692 867 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 693 868 869 if ($self->{'metadata_mapping_rules'}) { 870 $self->apply_metadata_mapping_file($doc_obj); 871 } 872 694 873 return 1; 695 874 } … … 820 999 return undef unless $self->can_process_this_file($filename_full_path); 821 1000 822 1001 #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n"; 823 1002 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_); 824 1003 #print STDERR "**** AFTER READ INTO DOC OBJ: $file\n"; -
main/trunk/greenstone2/perllib/strings.properties
r38736 r38742 809 809 BaseImporter.no_cover_image:Do not look for a prefix.jpg file (where prefix is the same prefix as the file being processed) to associate as a cover image. 810 810 811 BaseImporter.metadata_mapping_file:Use the specified metadata mapping file to generate additional metadata for a document. The specified comma-separated value file (csv) needs to be encoded as UTF-8, and consists of a series of rules, with 5 entries per line. The first entry in the line specifies the name of the source metadata to select from the document being processed, and the second entry is a regular expression the metadata value must match for the rule to be applied (Note: the syntax used is Perl's regular expression substitution, where parentheses form capture groups). If it does match, then the third entry is what the matching metadata value is transformed into (groups formed with parentheses in the source metadata matching term can be referenced as $1, $2 and so on). The fourth entry specifies any modifiers for the substitution, such as 'g' for global and 'i' for case-insensitive. The fifth entry specifies the metadata name that is set with the newly created value. The rules are applied in the order they are provided in the comma-separated value file, so it is permissible for metadata set by one of the earlier rules to then be used in a later matching rule. Destination metadata names that start with '_transient' are not stored in the final document. For an example of a metadata_mapping_file, refer to the one provided in GSDLHOME/etc/metadata_mapping_rules.csv. 812 811 813 BaseImporter.OIDtype.auto:Use OIDtype set in import.pl 812 814
Note:
See TracChangeset
for help on using the changeset viewer.