source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/perllib/plugins/MetadataCSVPlugin.pm@ 35991

Last change on this file since 35991 was 35991, checked in by davidb, 2 years ago

Introduction of new feature to MetadataCSVPlugin that creates the filename by combining various metadata fields

File size: 12.1 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BaseImporter;
31use MetadataRead;
32use CSVFieldSeparator;
33
34use strict;
35no strict 'refs';
36
37use extrametautil;
38use multiread;
39use util;
40
41use Encode;
42use Text::CSV;
43
# Inheritance chain for this plugin. When several parents provide a method of
# the same name, the parent listed first in @ISA wins.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(MetadataRead BaseImporter CSVFieldSeparator);
}
48
49
50
# Plugin-specific argument descriptions. new() appends these to the shared
# "ArgList" it is handed.
#   process_exp                     regexp, defaults to get_default_process_exp()
#   filename_formed_from            comma separated metadata names combined into a filename
#   filename_formed_with_no_spaces  flag: strip whitespace from the combined values
#   filename_formed_from_ext        extension appended to the formed filename
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => &get_default_process_exp() },
    { 'name' => "filename_formed_from",
      'desc' => "Specify in a comma separated list the metadata fields that get combined together to form the filename",
      'type' => "string",
      'reqd' => "no"
    },
    { 'name' => "filename_formed_with_no_spaces",
      'desc' => "Set this flag to remove any spaces in the metadata values used to form the filename",
      'type' => "flag",
      'reqd' => "no"
    },
    { 'name' => "filename_formed_from_ext",
      'desc' => "The filename extension to add on to the end of the filename_formed_from metadata",
      'type' => "string",
      'reqd' => "no",
      'deft' => ".nul"
    }
];


# Plugin description record. new() appends it to the shared "OptList".
my $options = { 'name'     => "MetadataCSVPlugin",
                'desc'     => "{MetadataCSVPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
81
82
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by BaseImporter's constructor, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # The CSVFieldSeparator object itself is discarded; the call is made only
    # for its effect on the lists passed in.
    # NOTE(review): presumably it registers the separator arguments -- confirm.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
97
98
# Default value of the process_exp argument: a regular expression (as a
# string) matching names that end in ".csv", case-insensitively.
sub get_default_process_exp
{
    my $csv_suffix_exp = '(?i)\.csv$';
    return $csv_suffix_exp;
}
103
# Record a CSV metadata file in the block hash so that incremental building
# knows about it.
#
# Returns undef when $file is not a plain file or is not one this plugin
# processes; returns 1 once the file has been recorded.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $tail_name) = &util::get_full_filenames($base_dir, $file);

    # can't recognise: not a plain file, or not one of ours
    my $recognised = (-f $full_path) && $self->can_process_this_file($full_path);
    return undef unless ($recognised);

    # Flag it as a metadata file - needed for incremental build, since a
    # change to this file means everything has to be reimported.
    my $metadata_files = $block_hash->{'metadata_files'};
    if (!defined $metadata_files) {
        $metadata_files = {};
        $block_hash->{'metadata_files'} = $metadata_files;
    }
    $metadata_files->{$full_path} = 1;

    return 1;
}
121
# Read a CSV metadata file and record, for every data row, the metadata that
# row assigns to the file it names.
#
# The first line of the CSV file holds the metadata names. Each later line
# holds the values for one file. The file is named either by the "Filename"
# column or, when the filename_formed_from option is set, by joining the
# first value of each listed metadata field and appending
# filename_formed_from_ext.
#
# Parameters:
#   $pluginfo, $processor, $aux - accepted but not used here
#   $base_dir, $file            - locate the CSV file
#   $block_hash                 - the CSV file is added to it via block_raw_filename()
#   $extrametakeys, $extrametadata, $extrametafile
#                               - updated through the extrametautil functions
#   $gli                        - when true, progress and errors are also
#                                 written to STDERR as tagged lines
#
# Returns undef when this plugin does not handle the file and -1 on a fatal
# error (unreadable file, bad header line, no way to determine filenames).
# NOTE(review): on success there is no explicit return statement; this is
# kept as in the original because callers may rely on it.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    $self->block_raw_filename($block_hash,$filename_full_path);


    # Read the CSV file to get the metadata.
    # The handle stays a package bareword because multiread is given the
    # handle by name; the open itself is three-argument and checked.
    my $csv_file_content;
    if (!open(CSV_FILE, "<", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file: $!");
        return -1;
    }
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);

    # Would be nice if MetadataCSVPlugin was extended to support a minus
    # option to choose the character encoding the CSV file is in
    # For now we will assume it is always in UTF8
    $csv_file_content = decode("utf8",$csv_file_content);

    close(CSV_FILE);

    # A leading byte order mark would otherwise become part of the first
    # metadata name and stop it matching (e.g. "Filename").
    $csv_file_content =~ s/^\x{FEFF}//;

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;

    my $separate_char = $self->{'csv_field_separator'};

    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);

    if (!defined $csv_file_field_line) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: CSV file is empty");
        return -1;
    }

    if ($separate_char =~ m/^auto$/i) {
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    my @csv_file_fields = ();
    if ($csv->parse($csv_file_field_line)) {
        @csv_file_fields = $csv->fields;
    }
    else {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
        return -1;
    }

    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        # ($field_name aliases the array element, so this edits in place)
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    # An option left unset may arrive as undef or as the empty string, so
    # treat both as "not given".
    my $filename_formed_from = $self->{'filename_formed_from'};
    my $form_filename = (defined $filename_formed_from && $filename_formed_from ne "") ? 1 : 0;

    # Work out once which metadata names make up the filename. Spaces are
    # removed so the names compare equal to the space-stripped header names.
    my @filename_formed_from_metanames = ();
    if ($form_filename) {
        foreach my $metaname (split(",", $filename_formed_from)) {
            $metaname =~ s/ //g;
            push(@filename_formed_from_metanames, $metaname) if ($metaname ne "");
        }
    }

    if (!$form_filename && !$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);
        my $orig_csv_line = $csv_line;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        if ($csv->parse($csv_line)) {
            my @md_vals = $csv->fields;

            for my $i (0 .. $#md_vals) {
                my $md_val = $md_vals[$i];
                my $md_name = $csv_file_fields[$i];

                # Only bother with non-empty values that have a column name
                next unless (defined $md_val && $md_val ne "" && defined $md_name);

                # One cell may hold several values when a separator is configured
                my @cell_values = (defined $md_val_sep)
                    ? split(/${md_val_sep}/, $md_val)
                    : ($md_val);

                # protect square brackets in metadata values by hex entity encoding them
                # As unescaped square bracket chars in metadata
                # have special meaning in GS' Java runtime code
                foreach my $meta_value (@cell_values) {
                    $meta_value =~ s/\[/&\#091;/g;
                    $meta_value =~ s/\]/&\#093;/g;
                }
                push (@{$csv_line_metadata{$md_name}}, @cell_values);
            }
        }
        else {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line");
            last;
        }

        if ($form_filename) {
            my $filename_metavalue = "";
            foreach my $metaname (@filename_formed_from_metanames) {
                my $metavalues = $csv_line_metadata{$metaname};
                next unless (defined $metavalues && scalar(@$metavalues) > 0);

                # NOTE(review): as before, the value used for the filename is
                # removed from the row's metadata -- confirm this is intended.
                my $metavalue = shift(@$metavalues);
                next unless (defined $metavalue);

                $metavalue =~ s/\s+//g if ($self->{'filename_formed_with_no_spaces'});
                $filename_metavalue .= $metavalue;
            }

            # Only a non-empty combined name is usable; checking before the
            # extension is appended avoids producing a bare ".nul" key.
            if ($filename_metavalue ne "") {
                if (defined $self->{'filename_formed_from_ext'}) {
                    $filename_metavalue .= $self->{'filename_formed_from_ext'};
                }

                # print STDERR "**** Setting up Metadata Filename match on: $filename_metavalue\n";

                $csv_line_metadata{"Filename"} = [ $filename_metavalue ];
            }
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array || scalar(@$csv_line_filename_array) == 0) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $orig_csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array && scalar(@$csv_line_section_array) > 0) {
            my $section_value = shift(@$csv_line_section_array);
            # NOTE(review): this pattern is unanchored, so any value containing
            # a digit or a dot counts as a section number -- confirm intent.
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                # keys() is evaluated once up front, so renaming while looping is safe
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                # not a section number after all: put the value back
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # For CSV files to include line-breaks these are typically stored as '\n'
        # Unescape them here so they present more nicely in the browser
        #
        # foreach my $metaname (keys %csv_line_metadata) {
        #     my $csv_line_metavalue_array = $csv_line_metadata{$metaname};
        #
        #     map { $_ =~ s/\\n/\n/g } @$csv_line_metavalue_array;
        # }


        # Associate the metadata now
        # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
        # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
        # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
        $csv_line_filename = &util::filepath_to_url_format($csv_line_filename);
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $csv_line_filename);
        if (defined $file_metadata_table) { # merge with existing meta

            foreach my $metaname (keys %csv_line_metadata) {
                # will create new entry if one does not already exist
                push(@{$file_metadata_table->{$metaname}}, @{$csv_line_metadata{$metaname}});
            }

            # no need to push $file on to $extrametakeys as it is already in the list
        }
        else { # add as new meta

            &extrametautil::setmetadata($extrametadata, $csv_line_filename, \%csv_line_metadata);
            &extrametautil::addmetakey($extrametakeys, $csv_line_filename);
        }

        # record which file the metadata came from
        if (!defined &extrametautil::getmetafile($extrametafile, $csv_line_filename)) {
            &extrametautil::setmetafile($extrametafile, $csv_line_filename, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path);
    }
}
353
# Report an error about a CSV file.
#
# The same line is written to both $outhandle and $failhandle. When $gli is
# true a <ProcessingError> tag naming the file and the error is also written
# to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $report_line = "MetadataCSVPlugin Error: $file: $error\n";

    foreach my $handle ($outhandle, $failhandle) {
        print {$handle} $report_line;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
3641;
Note: See TracBrowser for help on using the repository browser.