source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 23484

Last change on this file since 23484 was 23419, checked in by max, 13 years ago

Setting the values to store as block files is now done through an API call to BasePlugin. This way, any uniform requirement (such as putting in both C:\... and c:\... entries for Windows) can be handled in one place.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BasePlugin;
31use strict;
32no strict 'refs';
33use multiread;
34
35
# Establish the inheritance chain at compile time: this plugin derives
# from BasePlugin.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(BasePlugin);
}
39
40
# Arguments this plugin adds on top of those it inherits.  The only one
# is process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];
49
50
# Option record describing this plugin; it is appended to the caller's
# "OptList" in new() and carries the argument list declared above.
my $options = {
    'name'     => "MetadataCSVPlugin",
    'desc'     => "{MetadataCSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
56
57
# Constructor.
#
# Registers this class in the caller-supplied plugin list, appends this
# plugin's argument and option descriptions to the shared "ArgList" and
# "OptList" entries, then lets BasePlugin build the object, which is
# re-blessed into $class before being returned.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @$arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $plugin = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $plugin, $class;

    return $plugin;
}
71
72
# Returns the default process_exp: a regular expression (as a string)
# matching file names that end in ".csv", case-insensitively.
sub get_default_process_exp
{
    my $default_exp = q/(?i)\.csv$/;
    return $default_exp;
}
77
# Records a CSV metadata file in the block hash.
#
# Returns undef when $file (resolved against $base_dir) is not a plain
# file or is not one this plugin can process; otherwise flags the full
# path under $block_hash->{'metadata_files'} and returns 1.
sub file_block_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # Decline anything that is not a regular file we know how to handle
    my $is_ours = (-f $full_path) && $self->can_process_this_file($full_path);
    return undef unless $is_ours;   # can't recognise

    # Flag this as a metadata file - the incremental build needs to know,
    # because a change to this file means everything must be reimported
    my $metadata_files = ($block_hash->{'metadata_files'} ||= {});
    $metadata_files->{$full_path} = 1;

    return 1;
}
95
# Reads a CSV metadata file and records, for every data row, the metadata
# to be attached to the file named in that row's "Filename" column.
#
# The first line of the CSV file holds the metadata field names (spaces
# are stripped from them) and must include "Filename".  Each following
# non-blank line holds values; a value containing commas must be wrapped
# in double quotes.  Empty values, and values beyond the last named
# column, are ignored.
#
# Parameters used here:
#   $base_dir, $file  - joined to give the path of the CSV file
#   $block_hash       - passed to block_filename() so that the CSV file
#                       is not processed again in read
#   $extrametakeys    - array ref; the regex form of each row's Filename
#                       is appended
#   $extrametadata    - hash ref; maps that regex to a hash of
#                       metadata name => array ref of values
#   $extrametafile    - hash ref; maps that regex to { $file => full path }
#   $gli              - when true, GLI progress/error tags go to STDERR
# ($pluginfo, $processor and $aux are accepted but not used.)
#
# Returns undef if the file is not an existing .csv file, -1 if it cannot
# be opened or has no Filename column, and 1 once it has been processed.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # Read metadata from CSV files.  The extension test ignores case so
    # that it agrees with the plugin's default process_exp, "(?i)\.csv$";
    # otherwise a ".CSV" file would be accepted by file_block_read but
    # silently skipped here.
    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /\.csv$/i || !-f $filename) {
        return undef;
    }
    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    $self->block_filename($block_hash,$filename);

    # Read the CSV file to get the metadata.  Three-argument open keeps
    # odd characters in the file name from being read as an open mode,
    # and a failed open is reported rather than ignored.
    my $csv_file_content;
    if (!open(CSV_FILE, '<', $filename)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "Could not open CSV file for reading: $!");
        return -1; # error
    }
    my $csv_file_reader = multiread->new();
    # multiread is handed the handle by its package-qualified name
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);
    $csv_file_content = "" if (!defined $csv_file_content);

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;
    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    # An empty file has no header line at all; treat it as an empty header
    $csv_file_field_line = "" if (!defined $csv_file_field_line);
    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    my $found_filename_field = 0;
    foreach my $csv_file_field (@csv_file_fields) {
        # Remove any spaces from the field names (the loop variable
        # aliases the array element, so this edits the array in place)
        $csv_file_field =~ s/ //g;
        if ($csv_file_field eq "Filename") {
            $found_filename_field = 1;
        }
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;
        my $i = 0;
        # Work on a copy, with a trailing comma to make the regular
        # expressions simpler
        my $remaining = $csv_line . ",";
        while ($remaining ne "") {
            my $value;
            # Metadata values containing commas are quoted
            if ($remaining =~ s/^\"(.*?)\"\,//) {
                $value = $1;
            }
            # Normal comma-separated case
            elsif ($remaining =~ s/^(.*?)\,//) {
                $value = $1;
            }
            # The line must be formatted incorrectly
            else {
                $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $remaining");
                last;
            }

            # Only bother with non-empty values in a named column
            if ($value ne "" && defined($csv_file_fields[$i])) {
                push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
            }

            $i++;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # Associate the metadata now
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
        push(@$extrametakeys, $csv_line_filename);
        # record which file the metadata came from
        if (!defined $extrametafile->{$csv_line_filename}) {
            $extrametafile->{$csv_line_filename} = {};
        }
        # maps the file to full path
        $extrametafile->{$csv_line_filename}->{$file} = $filename;
    }

    # Return an explicit success value: a sub that ends on a foreach loop
    # has an unspecified return value.
    # NOTE(review): 1 is assumed to mean "processed", alongside the undef
    # and -1 results above - confirm against the code that calls
    # metadata_read.
    return 1;
}
206
# Reports an error about $file.
#
# The same "MetadataCSVPlugin Error" line is written to both $outhandle
# and $failhandle; when $gli is true a <ProcessingError> tag is also
# written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "MetadataCSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print $handle $message;
    }

    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
2171;
Note: See TracBrowser for help on using the repository browser.