source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 34249

Last change on this file since 34249 was 34249, checked in by ak19, 4 years ago

Dr Bainbridge, in his commit 32810, had expressed that he intended to commit his MetadataCSVPlugin-related work for dlheritage to the main GS after the then-upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection, which uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as there is something weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now, but freezes while trying to transmit data.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BaseImporter;
31use MetadataRead;
32use CSVFieldSeparator;
33
34use strict;
35no strict 'refs';
36
37use extrametautil;
38use multiread;
39use util;
40
41use Encode;
42use Text::CSV;
43
44# methods with identical signatures take precedence in the order given in the ISA list.
# Establish the inheritance chain at compile time.  Method lookup walks the
# parents left to right, so for methods with identical signatures the
# earlier entry in this list wins.
BEGIN {
    @MetadataCSVPlugin::ISA = (
        'MetadataRead',
        'BaseImporter',
        'CSVFieldSeparator',
    );
}
48
49
50
# Arguments this plugin contributes to the shared argument list.
# process_exp defaults to the value returned by get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
];

# Option record describing this plugin; it is appended to the caller's
# "OptList" by new().
my $options = {
    'name'     => "MetadataCSVPlugin",
    'desc'     => "{MetadataCSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
66
67
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its option record to "OptList"
#
# Returns the object built by BaseImporter->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) calls instead of the indirect object form
    # "new Class(...)": the indirect form is ambiguous to the parser inside
    # a package that itself defines a sub called new.
    #
    # The CSVFieldSeparator object is discarded on purpose (as in the
    # original code) - presumably the call is only needed for what it adds
    # to the shared argument/option lists; verify against CSVFieldSeparator.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
82
83
# Default value for the process_exp argument: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp
{
    my $default_process_exp = q^(?i)\.csv$^;
    return $default_process_exp;
}
88
# Register a CSV metadata file in the block hash.
#
# Returns undef when the path is not a plain file or is not accepted by
# can_process_this_file(); otherwise records the file under
# $block_hash->{'metadata_files'} and returns 1.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # can't recognise anything that is not a plain file we can process
    unless ((-f $filename_full_path) && $self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    # Flag this as a metadata file - needed for incremental build:
    # if this file changes, then everything needs to be reimported.
    my $metadata_files = ($block_hash->{'metadata_files'} ||= {});
    $metadata_files->{$filename_full_path} = 1;

    return 1;
}
106
# Return a copy of $value with literal square brackets replaced by numeric
# character entities.  Unescaped square brackets in metadata values have
# special meaning in Greenstone's Java runtime code.
sub _escape_square_brackets
{
    my ($value) = @_;

    $value =~ s/\[/&\#091;/g;
    $value =~ s/\]/&\#093;/g;

    return $value;
}

# Read a CSV metadata file and attach each row's metadata to the file named
# in that row's "Filename" column.
#
# Parameters (as passed by the plugin framework):
#   $pluginfo, $processor, $aux - not used by this method
#   $base_dir, $file            - combined by util::get_full_filenames to
#                                 locate the CSV file
#   $block_hash                 - the CSV file is added to it via
#                                 block_raw_filename so that read() skips it
#   $extrametakeys, $extrametadata, $extrametafile
#                               - extrameta structures updated through the
#                                 extrametautil helpers
#   $gli                        - when true, GLI-style markup is written to
#                                 STDERR
#
# Returns undef if this plugin cannot process the file for metadata, and -1
# if the file cannot be opened, is empty, has a malformed header line, or
# has no "Filename" column.
# NOTE(review): on success the sub falls off the end of the row loop, so the
# return value is implicit.  This is unchanged from the original; confirm
# what the caller expects before adding an explicit return.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    $self->block_raw_filename($block_hash, $filename_full_path);

    # Read the CSV file to get the metadata.
    # Three-argument open so the path can never be interpreted as a mode,
    # and an explicit check so an unreadable file is reported as such.
    # The handle stays a named package handle because it is passed to
    # multiread by name below.
    if (!open(CSV_FILE, '<', $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Error: Unable to open CSV file for reading: $!");
        return -1;
    }
    my $csv_file_content;
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);

    # Would be nice if MetadataCSVPlugin was extended to support a minus
    # option to choose the character encoding the CSV file is in
    # For now we will assume it is always in UTF8
    $csv_file_content = "" unless (defined $csv_file_content);
    $csv_file_content = decode("utf8", $csv_file_content);

    # Drop a leading byte-order mark, which would otherwise become part of
    # the first column name and stop it from ever matching "Filename".
    $csv_file_content =~ s/^\x{FEFF}//;

    # Split the file into lines and read the first line (contains the metadata names)
    # NOTE(review): because the content is split on newlines before CSV
    # parsing, a quoted field containing a line break is not kept together.
    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;

    my $separate_char = $self->{'csv_field_separator'};

    # An empty separator means "do not split values".  The separator is
    # interpolated into a regular expression further down, so regex
    # metacharacters in it (such as the pipe symbol) keep their regex meaning.
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    if (!defined $csv_file_field_line) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Error: CSV file is empty");
        return -1;
    }

    if ($separate_char =~ m/^auto$/i) {
        $separate_char = $self->resolve_auto($csv_file_field_line, $self->{'plugin_type'});
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if (!$csv->parse($csv_file_field_line)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Error: Badly formatted CSV header line: $csv_file_field_line");
        return -1;
    }
    my @csv_file_fields = $csv->fields;

    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        # ($field_name aliases the array element, so this edits it in place)
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);

        # A malformed row stops processing of the remaining rows; metadata
        # from earlier rows has already been recorded.
        if (!$csv->parse($csv_line)) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                               "Badly formatted CSV line: $csv_line");
            last;
        }

        # Build a hash of metadata name to list of metadata values for this line
        my %csv_line_metadata;
        my @md_vals = $csv->fields;

        for my $i (0 .. $#md_vals) {
            my $md_val  = $md_vals[$i];
            my $md_name = $csv_file_fields[$i];

            # Only bother with non-empty values that fall under a header column
            next unless (defined $md_name && defined $md_val && $md_val ne "");

            my @values = (defined $md_val_sep)
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            # push creates the list for $md_name on first use
            push(@{$csv_line_metadata{$md_name}},
                 map { _escape_square_brackets($_) } @values);
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array || !@$csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                               "No Filename metadata in CSV line: $csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # When the first Section value matches the pattern below, every
        # remaining metadata name on this row (the Section entry included)
        # gets "///Section/<value>" appended to it.
        # NOTE(review): the pattern is unanchored, so any value containing a
        # digit or a dot matches - confirm whether a full match was intended.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array && @$csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                # keys() is evaluated once up front, so renaming entries
                # inside the loop is safe
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                # Not a section number: put the value back as ordinary metadata
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # Associate the metadata now
        # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
        # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
        # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
        $csv_line_filename = &util::filepath_to_url_format($csv_line_filename);
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $csv_line_filename);
        if (defined $file_metadata_table) { # merge with existing meta
            foreach my $metaname (keys %csv_line_metadata) {
                # will create new entry if one does not already exist
                push(@{$file_metadata_table->{$metaname}}, @{$csv_line_metadata{$metaname}});
            }
            # no need to push $file on to $extrametakeys as it is already in the list
        }
        else { # add as new meta
            &extrametautil::setmetadata($extrametadata, $csv_line_filename, \%csv_line_metadata);
            &extrametautil::addmetakey($extrametakeys, $csv_line_filename);
        }

        # record which file the metadata came from
        if (!defined &extrametautil::getmetafile($extrametafile, $csv_line_filename)) {
            &extrametautil::setmetafile($extrametafile, $csv_line_filename, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path);
    }
}
301
# Report an error about $file.
#
# The same "MetadataCSVPlugin Error" line is written to both $outhandle and
# $failhandle; when $gli is true a <ProcessingError> element is also
# written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "MetadataCSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print {$handle} $message;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
3121;
Note: See TracBrowser for help on using the repository browser.