Ignore:
Timestamp:
2020-07-09T09:38:42+12:00 (4 years ago)
Author:
ak19
Message:

Dr Bainbridge in his commit 32810 had expressed that he intended to commit his MetadataCSVPlugin related work for dlheritage to the main GS after the then upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection that uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as there is something weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now and instead freezes while trying to transmit data.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm

    r32501 r34249  
    3030use BaseImporter;
    3131use MetadataRead;
     32use CSVFieldSeparator;
    3233
    3334use strict;
     
    3940
    4041use Encode;
     42use Text::CSV;
    4143
    4244# methods with identical signatures take precedence in the order given in the ISA list.
    4345sub BEGIN {
    44     @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter');
    45 }
     46    @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter', 'CSVFieldSeparator');
     47}
     48
    4649
    4750
     
    7275    push(@{$hashArgOptLists->{"OptList"}},$options);
    7376
     77    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
    7478    my $self = new BaseImporter($pluginlist, $inputargs, $hashArgOptLists);
    7579
     
    108112    $processor, $gli, $aux) = @_;
    109113
    110     # Read metadata from CSV files
    111     my $filename = &util::filename_cat($base_dir, $file);
    112     if ($filename !~ /\.csv$/ || !-f $filename) {
    113     return undef;
    114     }
     114    # can we process this file??
     115    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     116    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
     117   
    115118    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    116119    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
     
    120123
    121124    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    122     $self->block_raw_filename($block_hash,$filename);
     125    $self->block_raw_filename($block_hash,$filename_full_path);
    123126
    124127
    125128    # Read the CSV file to get the metadata
    126129    my $csv_file_content;
    127     open(CSV_FILE, "$filename");
     130    open(CSV_FILE, "$filename_full_path");
    128131    my $csv_file_reader = new multiread();
    129132    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
     
    140143    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    141144    $csv_file_content =~ s/\n+/\n/g;
     145
     146    my $separate_char = $self->{'csv_field_separator'};
     147
     148    my $md_val_sep = $self->{'metadata_value_separator'};
     149    undef $md_val_sep if ($md_val_sep eq "");
     150
    142151    my @csv_file_lines = split(/\n/, $csv_file_content);
    143152    my $csv_file_field_line = shift(@csv_file_lines);
    144     my @csv_file_fields = split(/\,/, $csv_file_field_line);
     153
     154    if ($separate_char =~ m/^auto$/i) {
     155    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
     156    }
     157
     158    my $csv = Text::CSV->new();
     159    $csv->sep_char($separate_char);
     160 
     161    my @csv_file_fields = undef;
     162    if ($csv->parse($csv_file_field_line)) {
     163    @csv_file_fields = $csv->fields;
     164    }
     165    else {
     166    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
     167    return -1;
     168    }
     169
    145170    my $found_filename_field = 0;
    146171    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
     
    156181
    157182    if (!$found_filename_field) {
    158     $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
     183    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file");
    159184    return -1; # error
    160185    }
     
    164189    next if ($csv_line =~ /^\s*$/);
    165190    my $orig_csv_line = $csv_line;
     191
    166192    # Build a hash of metadata name to metadata value for this line
    167193    my %csv_line_metadata;
    168     my $i = 0;
    169     $csv_line .= ",";  # To make the regular expressions simpler
    170     while ($csv_line ne "") {
    171         # Metadata values containing commas are quoted
    172         if ($csv_line =~ s/^\"(.*?)\"\,//) {
     194
     195    if ($csv->parse($csv_line)) {
     196        my @md_vals = $csv->fields;
     197        my $md_vals_len = scalar(@md_vals);
     198       
     199        for (my $i=0; $i<$md_vals_len; $i++) {
     200        my $md_val = $md_vals[$i];
    173201        # Only bother with non-empty values
    174         if ($1 ne "" && defined($csv_file_fields[$i])) {
    175             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
    176             $csv_line_metadata{$csv_file_fields[$i]} = [];
     202        if ($md_val ne "" && defined($csv_file_fields[$i])) {
     203
     204            my $md_name = $csv_file_fields[$i];
     205
     206            if (!defined $md_name) {
     207            $csv_line_metadata{$md_name} = [];
    177208            }
    178             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $1);
     209
     210            if (defined $md_val_sep) {         
     211           
     212            my @within_md_vals = split(/${md_val_sep}/,$md_val);
     213            #push (@{$csv_line_metadata{$md_name}}, @within_md_vals);
     214
     215            # protect square brackets in metadata values by hex entity encoding them
     216            # As unescaped square bracket chars in metadata
     217            # have special meaning in GS' Java runtime code
     218            my @escaped_within_md_vals = ();
     219            for my $meta_value (@within_md_vals) {
     220                $meta_value =~ s/\[/&\#091;/g;
     221                $meta_value =~ s/\]/&\#093;/g;
     222                push(@escaped_within_md_vals, $meta_value);
     223            }
     224            push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals);
     225
     226#           foreach my $within_md_val (@within_md_vals) {
     227#               push (@{$csv_line_metadata{$md_name}}, $within_md_val);
     228#           }
     229            }
     230            else {
     231            #push (@{$csv_line_metadata{$md_name}}, $md_val);
     232            # protect square brackets in metadata values by hex entity encoding them
     233            my $escaped_metadata_value = $md_val;
     234            $escaped_metadata_value =~ s/\[/&\#091;/g;
     235            $escaped_metadata_value =~ s/\]/&\#093;/g;
     236            push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value);
     237            }
    179238        }
    180239        }
    181         # Normal comma-separated case
    182         elsif ($csv_line =~ s/^(.*?)\,//) {
    183         # Only bother with non-empty values
    184         if ($1 ne "" && defined($csv_file_fields[$i])) {
    185             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
    186             $csv_line_metadata{$csv_file_fields[$i]} = [];
    187             }
    188             # remove any surrounding quotes. (When exporting to CSV, some spreadsheet
    189             # programs add quotes even around field values that don't contain commas.)
    190             my $value = $1;
    191             $value =~ s/^"//;
    192             $value =~ s/"$//;
    193             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
    194         }
    195         }
    196         # The line must be formatted incorrectly
    197         else {
    198         $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $csv_line");
    199         last;
    200         }
    201 
    202         $i++;
     240    }
     241    else {
     242        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line");
     243        last;
    203244    }
    204245
     
    206247    my $csv_line_filename_array = $csv_line_metadata{"Filename"};
    207248    if (!defined $csv_line_filename_array) {
    208         $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $orig_csv_line");
     249        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $orig_csv_line");
    209250        next;
    210251    }
     
    255296    }
    256297    # maps the file to full path
    257     &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename);
     298    &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path);
    258299    }
    259300}
Note: See TracChangeset for help on using the changeset viewer.