root/main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm @ 32501

Revision 32501, 9.1 KB (checked in by litvinovg, 22 months ago)

Workaround to assign metadata to sections via the CSV metadata plugin. A "Section" column can be used in the CSV file to specify the section that a row's metadata should be assigned to.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BaseImporter;
31use MetadataRead;
32
33use strict;
34no strict 'refs';
35
36use extrametautil;
37use multiread;
38use util;
39
40use Encode;
41
42# methods with identical signatures take precedence in the order given in the ISA list.
BEGIN {
    # Establish the parent classes at compile time; MetadataRead is listed
    # first so that its methods are found before BaseImporter's.
    @MetadataCSVPlugin::ISA = qw(MetadataRead BaseImporter);
}
46
47
# Plugin-specific argument descriptions. The only argument declared here is
# process_exp, whose default is supplied by get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];
56
57
# Option record describing this plugin; 'args' points at the argument list
# declared above.
my $options = {
    'name'     => 'MetadataCSVPlugin',
    'desc'     => '{MetadataCSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
63
64
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the BaseImporter constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by BaseImporter, re-blessed into $class.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than the indirect-object form
    # ("new BaseImporter(...)"), which perl parses heuristically.
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
78
79
# Returns the default process_exp for this plugin: a case-insensitive
# pattern matching names that end in ".csv".
sub get_default_process_exp
{
    my $csv_suffix_pattern = '(?i)\.csv$';
    return $csv_suffix_pattern;
}
84
# Registers a CSV metadata file in the block hash.
#
# Returns undef when the resolved path is not a plain file or when
# can_process_this_file() rejects it; otherwise records the full path under
# $block_hash->{'metadata_files'} and returns 1.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # can't recognise anything that isn't a regular file we can process
    return undef unless ((-f $full_path) && $self->can_process_this_file($full_path));

    # Flag this as a metadata file - needed for incremental build: if this
    # file changes, then we need to reimport everything
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
102
# Reads a CSV metadata file and records the metadata it contains against the
# files named in its "Filename" column.
#
# The first line of the file gives the metadata field names; one of them must
# be "Filename". Each later line supplies values for one file. An optional
# "Section" column holding a dotted section number (e.g. 1.2) causes every
# metadata name on that line to be given the suffix "///Section/<number>".
#
# Arguments used here:
#   $base_dir, $file - joined to form the path of the CSV file
#   $block_hash      - passed to block_raw_filename() so the CSV file itself
#                      is not processed again later
#   $extrametakeys, $extrametadata, $extrametafile
#                    - structures updated through the extrametautil helpers
#   $gli             - when true, progress/error markup is written to STDERR
# ($pluginfo, $processor and $aux are accepted but not used.)
#
# Returns undef if the file is not a *.csv plain file, -1 if the file cannot
# be opened or has no Filename column, and 1 once the file has been read.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # Read metadata from CSV files
    # NOTE(review): this test is case-sensitive, whereas the default
    # process_exp is case-insensitive - confirm whether ".CSV" files are
    # meant to be skipped here.
    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /\.csv$/ || !-f $filename) {
        return undef;
    }
    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    $self->block_raw_filename($block_hash, $filename);

    # Read the CSV file to get the metadata. The handle is a package-qualified
    # bareword because multiread is given the handle by name.
    my $csv_file_content;
    if (!open(MetadataCSVPlugin::CSV_FILE, '<', $filename)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "Could not open CSV file: $!");
        return -1; # error
    }
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(MetadataCSVPlugin::CSV_FILE);

    # Would be nice if MetadataCSVPlugin was extended to support a minus
    # option to choose the character encoding the CSV file is in
    # For now we will assume it is always in UTF8
    $csv_file_content = "" unless defined $csv_file_content;
    $csv_file_content = decode("utf8", $csv_file_content);

    # Drop a leading byte-order mark, which would otherwise become part of
    # the first field name and stop "Filename" from being recognised
    $csv_file_content =~ s/^\x{FEFF}//;

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;
    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    $csv_file_field_line = "" unless defined $csv_file_field_line; # empty file
    my @csv_file_fields = split(/\,/, $csv_file_field_line);

    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        # ($field_name aliases the array element, so the array is updated)
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);
        my $orig_csv_line = $csv_line;

        # Build a hash of metadata name to list of metadata values for this line
        my %csv_line_metadata;
        my $i = 0;
        $csv_line .= ",";  # To make the regular expressions simpler
        while ($csv_line ne "") {
            my $raw_value;
            my $strip_quotes = 0;

            # Metadata values containing commas are quoted
            if ($csv_line =~ s/^\"(.*?)\"\,//) {
                $raw_value = $1;
            }
            # Normal comma-separated case
            elsif ($csv_line =~ s/^(.*?)\,//) {
                $raw_value = $1;
                # remove any surrounding quotes. (When exporting to CSV, some spreadsheet
                # programs add quotes even around field values that don't contain commas.)
                $strip_quotes = 1;
            }
            # The line must be formatted incorrectly
            else {
                $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $csv_line");
                last;
            }

            # Only bother with non-empty values in columns that have a name
            if ($raw_value ne "" && defined($csv_file_fields[$i])) {
                my $value = $raw_value;
                if ($strip_quotes) {
                    $value =~ s/^"//;
                    $value =~ s/"$//;
                }
                push(@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
            }

            $i++;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $orig_csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # A "Section" value that is purely a dotted number redirects all of
        # this line's metadata to that section by suffixing each name.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            # Anchored so that ordinary text which merely contains a digit or
            # a dot is not mistaken for a section number
            if ($section_value =~ /^\s*([\d.]+)\s*$/) {
                my $section_suffix = "///Section/" . $1;

                # The section number has been consumed; don't leave an empty
                # "Section" entry behind to be renamed as metadata
                if (!@$csv_line_section_array) {
                    delete $csv_line_metadata{"Section"};
                }

                # keys() is evaluated once up front, so renaming inside the
                # loop does not revisit the new names
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            } else {
                # Not a section number: keep it as an ordinary metadata value
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # Associate the metadata now
        # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
        # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
        # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
        $csv_line_filename = &util::filepath_to_url_format($csv_line_filename);
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $csv_line_filename);
        if (defined $file_metadata_table) { # merge with existing meta
            foreach my $metaname (keys %csv_line_metadata) {
                # will create new entry if one does not already exist
                push(@{$file_metadata_table->{$metaname}}, @{$csv_line_metadata{$metaname}});
            }
            # no need to push $file on to $extrametakeys as it is already in the list
        } else { # add as new meta
            &extrametautil::setmetadata($extrametadata, $csv_line_filename, \%csv_line_metadata);
            &extrametautil::addmetakey($extrametakeys, $csv_line_filename);
        }

        # record which file the metadata came from
        if (!defined &extrametautil::getmetafile($extrametafile, $csv_line_filename)) {
            &extrametautil::setmetafile($extrametafile, $csv_line_filename, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename);
    }

    # NOTE(review): previously the sub fell off the end of the foreach loop,
    # which leaves the return value unspecified. 1 mirrors file_block_read's
    # success value - confirm against the caller's conventions.
    return 1;
}
260
# Reports an error about $file: the same one-line message is written to both
# $outhandle and $failhandle, and when $gli is true a <ProcessingError>
# element is also written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "MetadataCSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print {$handle} $message;
    }

    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
2711;
Note: See TracBrowser for help on using the browser.