Changeset 34249

Show
Ignore:
Timestamp:
09.07.2020 09:38:42 (4 weeks ago)
Author:
ak19
Message:

Dr Bainbridge, in his commit 32810, had expressed that he intended to commit his MetadataCSVPlugin-related work for dlheritage to the main GS after the then-upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection, which uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as there is something weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now, but freezes while trying to transmit data.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
1 added
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

    r33389 r34249  
    2727package CSVPlugin; 
    2828 
    29  
    3029use SplitTextFile; 
    3130use MetadataRead; 
     31use CSVFieldSeparator; 
     32 
    3233use strict; 
    3334no strict 'refs'; # allow filehandles to be variables and viceversa 
    3435 
     36use Text::CSV; 
    3537 
    3638# CSVPlugin is a sub-class of SplitTextFile. 
    3739sub BEGIN { 
    38     @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile'); 
     40    @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile', 'CSVFieldSeparator'); 
    3941} 
    4042 
    4143 
    4244my $arguments =  
    43     [ { 'name' => "process_exp", 
     45    [  
     46      { 'name' => "process_exp", 
    4447    'desc' => "{BaseImporter.process_exp}", 
    4548    'type' => "regexp", 
     
    8487    push(@{$hashArgOptLists->{"OptList"}}, $options); 
    8588 
     89    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists); 
    8690    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists); 
    8791 
     
    110114    $$textref =~ s/^(.*?)\r?\n//; 
    111115    my @csv_file_fields = (); 
    112     my $csv_file_field_line = $1 . ",";  # To make the regular expressions simpler 
    113     while ($csv_file_field_line ne "") { 
    114     # Handle quoted values 
    115     if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) { 
    116         my $csv_file_field = $1; 
    117         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names 
    118         push(@csv_file_fields, $csv_file_field); 
    119     } 
    120     # Normal comma-separated case 
    121     elsif ($csv_file_field_line =~ s/^(.*?)\,//) { 
    122         my $csv_file_field = $1; 
    123         $csv_file_field =~ s/ //g;  # Remove any spaces from the field names 
    124         push(@csv_file_fields, $csv_file_field); 
    125     } 
    126     # The line must be formatted incorrectly 
    127     else { 
    128         print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n"; 
    129         last; 
    130     } 
     116    my $csv_file_field_line = $1; 
     117 
     118    my $separate_char = $self->{'csv_field_separator'}; 
     119    if ($separate_char =~ m/^auto$/i) { 
     120    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'}); 
     121    # Replace the 'auto' setting with the resolved value (for use later on) 
     122    $self->{'separate_char'} = $separate_char; 
    131123    } 
    132124 
    133     $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields; 
    134     ###print STDERR "**** CSV file fields joined ($filename) = ", join(" ||| ", @{$self->{'csv_file_fields'}->{$filename}}), "\n"; 
     125    my $csv = Text::CSV->new(); 
     126    $csv->sep_char($separate_char); 
     127  
     128    if ($csv->parse($csv_file_field_line)) { 
     129    @csv_file_fields = $csv->fields; 
     130    } 
     131    else { 
     132    print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n"; 
     133    } 
     134 
     135    $self->{'csv_file_fields'} = \@csv_file_fields; 
     136     
     137    # print STDERR "**** CSV file fields joined = ", join(" ||| ", @{$self->{'csv_file_fields'}}), "\n"; 
    135138} 
    136139 
     
    144147    my $section = $doc_obj->get_top_section(); 
    145148    my $csv_line = $$textref; 
    146  
    147     my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file); 
    148     my @csv_file_fields = @{$self->{'csv_file_fields'}->{$filename_full_path}}; 
    149  
    150     ###print STDERR "**** CSV file fields joined = ", join(" ||| ", @csv_file_fields), "\n"; 
    151  
    152      # Add the raw line as the document text 
     149    my @csv_file_fields = @{$self->{'csv_file_fields'}};     
     150     
     151    # Add the raw line as the document text 
    153152    $doc_obj->add_utf8_text($section, $csv_line); 
    154153 
     154    my $separate_char = $self->{'separate_char'}; 
     155 
     156    my $md_val_sep = $self->{'metadata_value_separator'}; 
     157    undef $md_val_sep if ($md_val_sep eq ""); 
     158 
     159    my $csv = Text::CSV->new(); 
     160    $csv->sep_char($separate_char); 
     161  
    155162    # Build a hash of metadata name to metadata value for this line 
    156     my $i = 0; 
    157     $csv_line .= ",";  # To make the regular expressions simpler 
    158     while ($csv_line ne "") { 
    159     # Metadata values containing commas are quoted 
    160     if ($csv_line =~ s/^\"(.*?)\"\,//) { 
     163    if ($csv->parse($csv_line)) { 
     164    my @md_vals = $csv->fields; 
     165    my $md_vals_len = scalar(@md_vals); 
     166 
     167    for (my $i=0; $i<$md_vals_len; $i++) { 
     168        my $md_val = $md_vals[$i]; 
    161169        # Only bother with non-empty values 
    162         if ($1 ne "" && defined($csv_file_fields[$i])) { 
    163         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1); 
     170        if ($md_val ne "" && defined($csv_file_fields[$i])) { 
     171        if (defined $md_val_sep) { 
     172            my $md_name = $csv_file_fields[$i]; 
     173 
     174            my @within_md_vals = split(/${md_val_sep}/,$md_val); 
     175            foreach my $within_md_val (@within_md_vals) { 
     176            $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val); 
     177            } 
     178        } 
     179        else { 
     180            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val); 
     181        } 
    164182        } 
    165183    } 
    166     # Normal comma-separated case 
    167     elsif ($csv_line =~ s/^(.*?)\,//) { 
    168         # Only bother with non-empty values 
    169         if ($1 ne "" && defined($csv_file_fields[$i])) { 
    170         $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $1); 
    171         } 
    172     } 
    173     # The line must be formatted incorrectly 
    174     else { 
    175         print STDERR "Error: Badly formatted CSV line: $csv_line.\n"; 
    176         last; 
    177     } 
    178  
    179     $i++; 
     184    } 
     185    else { 
     186    print STDERR "Error: Badly formatted CSV line: $csv_line.\n"; 
    180187    } 
    181188 
  • main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm

    r32501 r34249  
    3030use BaseImporter; 
    3131use MetadataRead; 
     32use CSVFieldSeparator; 
    3233 
    3334use strict; 
     
    3940 
    4041use Encode; 
     42use Text::CSV; 
    4143 
    4244# methods with identical signatures take precedence in the order given in the ISA list. 
    4345sub BEGIN { 
    44     @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter'); 
    45 } 
     46    @MetadataCSVPlugin::ISA = ('MetadataRead', 'BaseImporter', 'CSVFieldSeparator'); 
     47} 
     48 
    4649 
    4750 
     
    7275    push(@{$hashArgOptLists->{"OptList"}},$options); 
    7376 
     77    new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists); 
    7478    my $self = new BaseImporter($pluginlist, $inputargs, $hashArgOptLists); 
    7579 
     
    108112    $processor, $gli, $aux) = @_; 
    109113 
    110     # Read metadata from CSV files 
    111     my $filename = &util::filename_cat($base_dir, $file); 
    112     if ($filename !~ /\.csv$/ || !-f $filename) { 
    113     return undef; 
    114     } 
     114    # can we process this file?? 
     115    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     116    return undef unless $self->can_process_this_file_for_metadata($filename_full_path); 
     117     
    115118    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli); 
    116119    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1; 
     
    120123 
    121124    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here 
    122     $self->block_raw_filename($block_hash,$filename); 
     125    $self->block_raw_filename($block_hash,$filename_full_path); 
    123126 
    124127 
    125128    # Read the CSV file to get the metadata 
    126129    my $csv_file_content; 
    127     open(CSV_FILE, "$filename"); 
     130    open(CSV_FILE, "$filename_full_path"); 
    128131    my $csv_file_reader = new multiread(); 
    129132    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE'); 
     
    140143    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings 
    141144    $csv_file_content =~ s/\n+/\n/g; 
     145 
     146    my $separate_char = $self->{'csv_field_separator'}; 
     147 
     148    my $md_val_sep = $self->{'metadata_value_separator'}; 
     149    undef $md_val_sep if ($md_val_sep eq ""); 
     150 
    142151    my @csv_file_lines = split(/\n/, $csv_file_content); 
    143152    my $csv_file_field_line = shift(@csv_file_lines); 
    144     my @csv_file_fields = split(/\,/, $csv_file_field_line); 
     153 
     154    if ($separate_char =~ m/^auto$/i) { 
     155    $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'}); 
     156    } 
     157 
     158    my $csv = Text::CSV->new(); 
     159    $csv->sep_char($separate_char); 
     160  
     161    my @csv_file_fields = undef; 
     162    if ($csv->parse($csv_file_field_line)) { 
     163    @csv_file_fields = $csv->fields; 
     164    } 
     165    else { 
     166    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line"); 
     167    return -1; 
     168    } 
     169 
    145170    my $found_filename_field = 0; 
    146171    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) { 
     
    156181 
    157182    if (!$found_filename_field) { 
    158     $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file"); 
     183    $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file"); 
    159184    return -1; # error 
    160185    } 
     
    164189    next if ($csv_line =~ /^\s*$/); 
    165190    my $orig_csv_line = $csv_line; 
     191 
    166192    # Build a hash of metadata name to metadata value for this line 
    167193    my %csv_line_metadata; 
    168     my $i = 0; 
    169     $csv_line .= ",";  # To make the regular expressions simpler 
    170     while ($csv_line ne "") { 
    171         # Metadata values containing commas are quoted 
    172         if ($csv_line =~ s/^\"(.*?)\"\,//) { 
     194 
     195    if ($csv->parse($csv_line)) { 
     196        my @md_vals = $csv->fields; 
     197        my $md_vals_len = scalar(@md_vals); 
     198         
     199        for (my $i=0; $i<$md_vals_len; $i++) { 
     200        my $md_val = $md_vals[$i]; 
    173201        # Only bother with non-empty values 
    174         if ($1 ne "" && defined($csv_file_fields[$i])) { 
    175             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) { 
    176             $csv_line_metadata{$csv_file_fields[$i]} = []; 
     202        if ($md_val ne "" && defined($csv_file_fields[$i])) { 
     203 
     204            my $md_name = $csv_file_fields[$i]; 
     205 
     206            if (!defined $md_name) { 
     207            $csv_line_metadata{$md_name} = []; 
    177208            } 
    178             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $1); 
     209 
     210            if (defined $md_val_sep) {           
     211             
     212            my @within_md_vals = split(/${md_val_sep}/,$md_val); 
     213            #push (@{$csv_line_metadata{$md_name}}, @within_md_vals); 
     214 
     215            # protect square brackets in metadata values by hex entity encoding them 
     216            # As unescaped square bracket chars in metadata 
     217            # have special meaning in GS' Java runtime code 
     218            my @escaped_within_md_vals = (); 
     219            for my $meta_value (@within_md_vals) { 
     220                $meta_value =~ s/\[/&\#091;/g; 
     221                $meta_value =~ s/\]/&\#093;/g; 
     222                push(@escaped_within_md_vals, $meta_value); 
     223            } 
     224            push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals); 
     225 
     226#           foreach my $within_md_val (@within_md_vals) { 
     227#               push (@{$csv_line_metadata{$md_name}}, $within_md_val); 
     228#           } 
     229            } 
     230            else { 
     231            #push (@{$csv_line_metadata{$md_name}}, $md_val); 
     232            # protect square brackets in metadata values by hex entity encoding them 
     233            my $escaped_metadata_value = $md_val; 
     234            $escaped_metadata_value =~ s/\[/&\#091;/g; 
     235            $escaped_metadata_value =~ s/\]/&\#093;/g; 
     236            push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value); 
     237            } 
    179238        } 
    180239        } 
    181         # Normal comma-separated case 
    182         elsif ($csv_line =~ s/^(.*?)\,//) { 
    183         # Only bother with non-empty values 
    184         if ($1 ne "" && defined($csv_file_fields[$i])) { 
    185             if (!defined $csv_line_metadata{$csv_file_fields[$i]}) { 
    186             $csv_line_metadata{$csv_file_fields[$i]} = []; 
    187             } 
    188             # remove any surrounding quotes. (When exporting to CSV, some spreadsheet 
    189             # programs add quotes even around field values that don't contain commas.) 
    190             my $value = $1; 
    191             $value =~ s/^"//; 
    192             $value =~ s/"$//; 
    193             push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $value); 
    194         } 
    195         } 
    196         # The line must be formatted incorrectly 
    197         else { 
    198         $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $csv_line"); 
    199         last; 
    200         } 
    201  
    202         $i++; 
     240    } 
     241    else { 
     242        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line"); 
     243        last; 
    203244    } 
    204245 
     
    206247    my $csv_line_filename_array = $csv_line_metadata{"Filename"}; 
    207248    if (!defined $csv_line_filename_array) { 
    208         $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $orig_csv_line"); 
     249        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $orig_csv_line"); 
    209250        next; 
    210251    } 
     
    255296    } 
    256297    # maps the file to full path 
    257     &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename); 
     298    &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path); 
    258299    } 
    259300}