Ignore:
Timestamp:
2020-07-09T09:38:42+12:00 (4 years ago)
Author:
ak19
Message:

Dr Bainbridge, in his commit 32810, had expressed that he intended to commit his MetadataCSVPlugin-related work for dlheritage to the main GS after the then-upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection, which uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as there is something weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now, but freezes while trying to transmit data.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

    r33389 r34249  
    2727package CSVPlugin;
    2828
    29 
    3029use SplitTextFile;
    3130use MetadataRead;
     31use CSVFieldSeparator;
     32
    3233use strict;
    3334no strict 'refs'; # allow filehandles to be variables and viceversa
    3435
     36use Text::CSV;
    3537
    3638# CSVPlugin is a sub-class of SplitTextFile.
    3739sub BEGIN {
    38     @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile');
     40    @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile', 'CSVFieldSeparator');
    3941}
    4042
    4143
    4244my $arguments =
    43     [ { 'name' => "process_exp",
     45    [
     46      { 'name' => "process_exp",
    4447    'desc' => "{BaseImporter.process_exp}",
    4548    'type' => "regexp",
     
    8487    push(@{$hashArgOptLists->{"OptList"}}, $options);
    8588
     89    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
    8690    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
    8791
     
    110114    $$textref =~ s/^(.*?)\r?\n//;
    111115    my @csv_file_fields = ();
    112     my $csv_file_field_line = $1 . ",";  # To make the regular expressions simpler
    113     while ($csv_file_field_line ne "") {
    114     # Handle quoted values
    115     if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
    116         my $csv_file_field = $1;
    117         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names
    118         push(@csv_file_fields, $csv_file_field);
    119     }
    120     # Normal comma-separated case
    121     elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
    122         my $csv_file_field = $1;
    123         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names
    124         push(@csv_file_fields, $csv_file_field);
    125     }
    126     # The line must be formatted incorrectly
    127     else {
    128         print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    129         last;
    130     }
     116    my $csv_file_field_line = $1;
     117
     118    my $separate_char = $self->{'csv_field_separator'};
     119    if ($separate_char =~ m/^auto$/i) {
     120    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
     121    # Replace the 'auto' setting the resolved value (for use later on)
     122    $self->{'separate_char'} = $separate_char;
    131123    }
    132124
    133     $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields;
    134     ###print STDERR "**** CSV file fields joined ($filename) = ", join(" ||| ", @{$self->{'csv_file_fields'}->{$filename}}), "\n";
     125    my $csv = Text::CSV->new();
     126    $csv->sep_char($separate_char);
     127 
     128    if ($csv->parse($csv_file_field_line)) {
     129    @csv_file_fields = $csv->fields;
     130    }
     131    else {
     132    print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
     133    }
     134
     135    $self->{'csv_file_fields'} = \@csv_file_fields;
     136   
     137    # print STDERR "**** CSV file fields joined = ", join(" ||| ", @{$self->{'csv_file_fields'}}), "\n";
    135138}
    136139
     
    144147    my $section = $doc_obj->get_top_section();
    145148    my $csv_line = $$textref;
    146 
    147     my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
    148     my @csv_file_fields = @{$self->{'csv_file_fields'}->{$filename_full_path}};
    149 
    150     ###print STDERR "**** CSV file fields joined = ", join(" ||| ", @csv_file_fields), "\n";
    151 
    152      # Add the raw line as the document text
     149    my @csv_file_fields = @{$self->{'csv_file_fields'}};   
     150   
     151    # Add the raw line as the document text
    153152    $doc_obj->add_utf8_text($section, $csv_line);
    154153
     154    my $separate_char = $self->{'separate_char'};
     155
     156    my $md_val_sep = $self->{'metadata_value_separator'};
     157    undef $md_val_sep if ($md_val_sep eq "");
     158
     159    my $csv = Text::CSV->new();
     160    $csv->sep_char($separate_char);
     161 
    155162    # Build a hash of metadata name to metadata value for this line
    156     my $i = 0;
    157     $csv_line .= ",";  # To make the regular expressions simpler
    158     while ($csv_line ne "") {
    159     # Metadata values containing commas are quoted
    160     if ($csv_line =~ s/^\"(.*?)\"\,//) {
     163    if ($csv->parse($csv_line)) {
     164    my @md_vals = $csv->fields;
     165    my $md_vals_len = scalar(@md_vals);
     166
     167    for (my $i=0; $i<$md_vals_len; $i++) {
     168        my $md_val = $md_vals[$i];
    161169        # Only bother with non-empty values
    162         if ($1 ne "" && defined($csv_file_fields[$i])) {
    163         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
     170        if ($md_val ne "" && defined($csv_file_fields[$i])) {
     171        if (defined $md_val_sep) {
     172            my $md_name = $csv_file_fields[$i];
     173
     174            my @within_md_vals = split(/${md_val_sep}/,$md_val);
     175            foreach my $within_md_val (@within_md_vals) {
     176            $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
     177            }
     178        }
     179        else {
     180            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val);
     181        }
    164182        }
    165183    }
    166     # Normal comma-separated case
    167     elsif ($csv_line =~ s/^(.*?)\,//) {
    168         # Only bother with non-empty values
    169         if ($1 ne "" && defined($csv_file_fields[$i])) {
    170         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
    171         }
    172     }
    173     # The line must be formatted incorrectly
    174     else {
    175         print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    176         last;
    177     }
    178 
    179     $i++;
     184    }
     185    else {
     186    print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    180187    }
    181188
Note: See TracChangeset for help on using the changeset viewer.