Changeset 34249


Ignore:
Timestamp:
2020-07-09T09:38:42+12:00 (4 years ago)
Author:
ak19
Message:

Dr Bainbridge, in his commit 32810, had expressed that he intended to commit his MetadataCSVPlugin-related work for dlheritage to the main GS after the then-upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection, which uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as something is weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now, but instead freezes while trying to transmit data.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

    r33389 r34249  
    2727package CSVPlugin;
    2828
    29 
    3029use SplitTextFile;
    3130use MetadataRead;
     31use CSVFieldSeparator;
     32
    3233use strict;
    3334no strict 'refs'; # allow filehandles to be variables and viceversa
    3435
     36use Text::CSV;
    3537
    3638# CSVPlugin is a sub-class of SplitTextFile.
    3739sub BEGIN {
    38     @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile');
     40    @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile', 'CSVFieldSeparator');
    3941}
    4042
    4143
    4244my $arguments =
    43     [ { 'name' => "process_exp",
     45    [
     46      { 'name' => "process_exp",
    4447    'desc' => "{BaseImporter.process_exp}",
    4548    'type' => "regexp",
     
    8487    push(@{$hashArgOptLists->{"OptList"}}, $options);
    8588
     89    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
    8690    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
    8791
     
    110114    $$textref =~ s/^(.*?)\r?\n//;
    111115    my @csv_file_fields = ();
    112     my $csv_file_field_line = $1 . ",";  # To make the regular expressions simpler
    113     while ($csv_file_field_line ne "") {
    114     # Handle quoted values
    115     if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
    116         my $csv_file_field = $1;
    117         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names
    118         push(@csv_file_fields, $csv_file_field);
    119     }
    120     # Normal comma-separated case
    121     elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
    122         my $csv_file_field = $1;
    123         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names
    124         push(@csv_file_fields, $csv_file_field);
    125     }
    126     # The line must be formatted incorrectly
    127     else {
    128         print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    129         last;
    130     }
     116    my $csv_file_field_line = $1;
     117
     118    my $separate_char = $self->{'csv_field_separator'};
     119    if ($separate_char =~ m/^auto$/i) {
     120    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
     121    # Replace the 'auto' setting the resolved value (for use later on)
     122    $self->{'separate_char'} = $separate_char;
    131123    }
    132124
    133     $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields;
    134     ###print STDERR "**** CSV file fields joined ($filename) = ", join(" ||| ", @{$self->{'csv_file_fields'}->{$filename}}), "\n";
     125    my $csv = Text::CSV->new();
     126    $csv->sep_char($separate_char);
     127 
     128    if ($csv->parse($csv_file_field_line)) {
     129    @csv_file_fields = $csv->fields;
     130    }
     131    else {
     132    print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
     133    }
     134
     135    $self->{'csv_file_fields'} = \@csv_file_fields;
     136   
     137    # print STDERR "**** CSV file fields joined = ", join(" ||| ", @{$self->{'csv_file_fields'}}), "\n";
    135138}
    136139
     
    144147    my $section = $doc_obj->get_top_section();
    145148    my $csv_line = $$textref;
    146 
    147     my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
    148     my @csv_file_fields = @{$self->{'csv_file_fields'}->{$filename_full_path}};
    149 
    150     ###print STDERR "**** CSV file fields joined = ", join(" ||| ", @csv_file_fields), "\n";
    151 
    152      # Add the raw line as the document text
     149    my @csv_file_fields = @{$self->{'csv_file_fields'}};   
     150   
     151    # Add the raw line as the document text
    153152    $doc_obj->add_utf8_text($section, $csv_line);
    154153
     154    my $separate_char = $self->{'separate_char'};
     155
     156    my $md_val_sep = $self->{'metadata_value_separator'};
     157    undef $md_val_sep if ($md_val_sep eq "");
     158
     159    my $csv = Text::CSV->new();
     160    $csv->sep_char($separate_char);
     161 
    155162    # Build a hash of metadata name to metadata value for this line
    156     my $i = 0;
    157     $csv_line .= ",";  # To make the regular expressions simpler
    158     while ($csv_line ne "") {
    159     # Metadata values containing commas are quoted
    160     if ($csv_line =~ s/^\"(.*?)\"\,//) {
     163    if ($csv->parse($csv_line)) {
     164    my @md_vals = $csv->fields;
     165    my $md_vals_len = scalar(@md_vals);
     166
     167    for (my $i=0; $i<$md_vals_len; $i++) {
     168        my $md_val = $md_vals[$i];
    161169        # Only bother with non-empty values
    162         if ($1 ne "" && defined($csv_file_fields[$i])) {
    163         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
     170        if ($md_val ne "" && defined($csv_file_fields[$i])) {
     171        if (defined $md_val_sep) {
     172            my $md_name = $csv_file_fields[$i];
     173
     174            my @within_md_vals = split(/${md_val_sep}/,$md_val);
     175            foreach my $within_md_val (@within_md_vals) {
     176            $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
     177            }
     178        }
     179        else {
     180            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val);
     181        }
    164182        }
    165183    }
    166     # Normal comma-separated case
    167     elsif ($csv_line =~ s/^(.*?)\,//) {
    168         # Only bother with non-empty values
    169         if ($1 ne "" && defined($csv_file_fields[$i])) {
    170         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1);
    171         }
    172     }
    173     # The line must be formatted incorrectly
    174     else {
    175         print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    176         last;
    177     }
    178 
    179     $i++;
     184    }
     185    else {
     186    print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    180187    }
    181188
  • main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm

    r32501 r34249  
    3030use BaseImporter;
    3131use MetadataRead;
     32use CSVFieldSeparator;
    3233
    3334use strict;
     
    3940
    4041use Encode;
     42use Text::CSV;
    4143
    4244# methods with identical signatures take precedence in the order given in the ISA list.
    4345sub BEGIN {
    44     @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter');
    45 }
     46    @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter', 'CSVFieldSeparator');
     47}
     48
    4649
    4750
     
    7275    push(@{$hashArgOptLists->{"OptList"}},$options);
    7376
     77    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
    7478    my $self = new BaseImporter($pluginlist, $inputargs, $hashArgOptLists);
    7579
     
    108112    $processor, $gli, $aux) = @_;
    109113
    110     # Read metadata from CSV files
    111     my $filename = &util::filename_cat($base_dir, $file);
    112     if ($filename !~ /\.csv$/ || !-f $filename) {
    113     return undef;
    114     }
     114    # can we process this file??
     115    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     116    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
     117   
    115118    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    116119    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
     
    120123
    121124    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    122     $self->block_raw_filename($block_hash,$filename);
     125    $self->block_raw_filename($block_hash,$filename_full_path);
    123126
    124127
    125128    # Read the CSV file to get the metadata
    126129    my $csv_file_content;
    127     open(CSV_FILE, "$filename");
     130    open(CSV_FILE, "$filename_full_path");
    128131    my $csv_file_reader = new multiread();
    129132    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
     
    140143    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    141144    $csv_file_content =~ s/\n+/\n/g;
     145
     146    my $separate_char = $self->{'csv_field_separator'};
     147
     148    my $md_val_sep = $self->{'metadata_value_separator'};
     149    undef $md_val_sep if ($md_val_sep eq "");
     150
    142151    my @csv_file_lines = split(/\n/, $csv_file_content);
    143152    my $csv_file_field_line = shift(@csv_file_lines);
    144     my @csv_file_fields = split(/\,/, $csv_file_field_line);
     153
     154    if ($separate_char =~ m/^auto$/i) {
     155    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
     156    }
     157
     158    my $csv = Text::CSV->new();
     159    $csv->sep_char($separate_char);
     160 
     161    my @csv_file_fields = undef;
     162    if ($csv->parse($csv_file_field_line)) {
     163    @csv_file_fields = $csv->fields;
     164    }
     165    else {
     166    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
     167    return -1;
     168    }
     169
    145170    my $found_filename_field = 0;
    146171    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
     
    156181
    157182    if (!$found_filename_field) {
    158     $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
     183    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file");
    159184    return -1; # error
    160185    }
     
    164189    next if ($csv_line =~ /^\s*$/);
    165190    my $orig_csv_line = $csv_line;
     191
    166192    # Build a hash of metadata name to metadata value for this line
    167193    my %csv_line_metadata;
    168     my $i = 0;
    169     $csv_line .= ",";  # To make the regular expressions simpler
    170     while ($csv_line ne "") {
    171         # Metadata values containing commas are quoted
    172         if ($csv_line =~ s/^\"(.*?)\"\,//) {
     194
     195    if ($csv->parse($csv_line)) {
     196        my @md_vals = $csv->fields;
     197        my $md_vals_len = scalar(@md_vals);
     198       
     199        for (my $i=0; $i<$md_vals_len; $i++) {
     200        my $md_val = $md_vals[$i];
    173201        # Only bother with non-empty values
    174         if ($1 ne "" && defined($csv_file_fields[$i])) {
    175             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
    176             $csv_line_metadata{$csv_file_fields[$i]} = [];
     202        if ($md_val ne "" && defined($csv_file_fields[$i])) {
     203
     204            my $md_name = $csv_file_fields[$i];
     205
     206            if (!defined $md_name) {
     207            $csv_line_metadata{$md_name} = [];
    177208            }
    178             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $1);
     209
     210            if (defined $md_val_sep) {         
     211           
     212            my @within_md_vals = split(/${md_val_sep}/,$md_val);
     213            #push (@{$csv_line_metadata{$md_name}}, @within_md_vals);
     214
     215            # protect square brackets in metadata values by hex entity encoding them
     216            # As unescaped square bracket chars in metadata
     217            # have special meaning in GS' Java runtime code
     218            my @escaped_within_md_vals = ();
     219            for my $meta_value (@within_md_vals) {
     220                $meta_value =~ s/\[/&\#091;/g;
     221                $meta_value =~ s/\]/&\#093;/g;
     222                push(@escaped_within_md_vals, $meta_value);
     223            }
     224            push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals);
     225
     226#           foreach my $within_md_val (@within_md_vals) {
     227#               push (@{$csv_line_metadata{$md_name}}, $within_md_val);
     228#           }
     229            }
     230            else {
     231            #push (@{$csv_line_metadata{$md_name}}, $md_val);
     232            # protect square brackets in metadata values by hex entity encoding them
     233            my $escaped_metadata_value = $md_val;
     234            $escaped_metadata_value =~ s/\[/&\#091;/g;
     235            $escaped_metadata_value =~ s/\]/&\#093;/g;
     236            push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value);
     237            }
    179238        }
    180239        }
    181         # Normal comma-separated case
    182         elsif ($csv_line =~ s/^(.*?)\,//) {
    183         # Only bother with non-empty values
    184         if ($1 ne "" && defined($csv_file_fields[$i])) {
    185             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
    186             $csv_line_metadata{$csv_file_fields[$i]} = [];
    187             }
    188             # remove any surrounding quotes. (When exporting to CSV, some spreadsheet
    189             # programs add quotes even around field values that don't contain commas.)
    190             my $value = $1;
    191             $value =~ s/^"//;
    192             $value =~ s/"$//;
    193             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
    194         }
    195         }
    196         # The line must be formatted incorrectly
    197         else {
    198         $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $csv_line");
    199         last;
    200         }
    201 
    202         $i++;
     240    }
     241    else {
     242        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line");
     243        last;
    203244    }
    204245
     
    206247    my $csv_line_filename_array = $csv_line_metadata{"Filename"};
    207248    if (!defined $csv_line_filename_array) {
    208         $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $orig_csv_line");
     249        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $orig_csv_line");
    209250        next;
    210251    }
     
    255296    }
    256297    # maps the file to full path
    257     &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename);
     298    &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path);
    258299    }
    259300}
Note: See TracChangeset for help on using the changeset viewer.