source: main/trunk/model-sites-dev/atea/collect/voxel-vr/perllib/plugins/MetadataYAMLPlugin.pm@ 34473

Last change on this file since 34473 was 34473, checked in by davidb, 4 years ago

Initial cut at files for Voxel-VR executables

File size: 14.9 KB
Line 
1###########################################################################
2#
3# MetadataYAMLPlugin.pm -- A plugin for metadata in YAML format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataYAMLPlugin;
28
29
30use BaseImporter;
31use MetadataRead;
32# use CSVFieldSeparator;
33
34use strict;
35no strict 'refs';
36
37use extrametautil;
38use multiread;
39use util;
40
41use Encode;
42use YAML;
43
# Inheritance is set up at compile time.  When both parents provide a method
# of the same name, the parent listed first (MetadataRead) is the one used.
# An earlier three-parent list that also named 'CSVFieldSeparator' is left out.
BEGIN {
    @MetadataYAMLPlugin::ISA = qw(MetadataRead BaseImporter);
}
49
50
51
# Argument descriptions contributed by this plugin.  The only one is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Option record for the plugin; 'args' points at the argument list above.
my $options = {
    'name'     => "MetadataYAMLPlugin",
    'desc'     => "{MetadataYAMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
67
68
# Constructor.
#
# Parameters:
#   $class            - class name to bless the new object into
#   $pluginlist       - array ref; $class is appended to it
#   $inputargs        - passed through unchanged to BaseImporter's constructor
#   $hashArgOptLists  - hash ref; this plugin's argument descriptions are
#                       appended to its "ArgList" entry and its option record
#                       to its "OptList" entry
#
# Returns the object built by BaseImporter, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Call the parent constructor with arrow syntax.  The indirect-object form
    # ("new BaseImporter(...)") is ambiguous inside a package that defines its
    # own sub new, because perl has to guess whether "new" names a method or
    # a function call.
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
83
84
# Default value for the process_exp argument: a case-insensitive pattern
# that matches file names ending in ".yml" or ".yaml".
sub get_default_process_exp
{
    my $yaml_suffix_exp = '(?i)\.y(a?)ml$';
    return $yaml_suffix_exp;
}
89
# Register $file as a metadata file if this plugin can process it.
#
# Returns undef when $file is not a plain file or is not accepted by
# can_process_this_file(); otherwise notes the full path under
# $block_hash->{'metadata_files'} and returns 1.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $tail_name) = &util::get_full_filenames($base_dir, $file);

    # can't recognise: not a plain file ...
    return undef if (!-f $full_path);
    # ... or not something this plugin handles
    return undef if (!$self->can_process_this_file($full_path));

    # Flag this as a metadata file - needed for incremental build.
    # If this file changes, then everything needs to be reimported.
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
107
# Read one YAML metadata file and attach its contents, as extra metadata, to
# the file it names.
#
# The YAML content is expected to be a mapping.  Its 'Filename' entry names
# the file the metadata belongs to; every other entry becomes metadata, where
# a plain value contributes one metadata value and a sequence contributes one
# value per element.  A 'Section' entry gets no special treatment: it is
# stored as ordinary metadata.
#
# Parameters used here:
#   $base_dir, $file  - combined by util::get_full_filenames into the path read
#   $block_hash       - the YAML file is added to it via block_raw_filename so
#                       that it is not processed again in read
#   $extrametakeys, $extrametadata, $extrametafile
#                     - extra-metadata structures updated through extrametautil
#   $gli              - when true, progress/error markup is written to STDERR
# ($pluginfo, $processor and $aux are accepted but not used.)
#
# Returns:
#   undef - the file is not one this plugin processes for metadata
#   -1    - the file could not be opened or parsed, or has no usable Filename
#   1     - the metadata was recorded
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='MetadataYAMLPlugin'>\n" if ($gli);
    print STDERR "MetadataYAMLPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    $self->block_raw_filename($block_hash, $filename_full_path);

    # Read the YAML file to get the metadata.  Three-argument open keeps any
    # special characters in the path from being read as an open mode, and a
    # failure is reported instead of carrying on with an unopened handle.
    if (!open(YAML_FILE, '<', $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Could not open YAML metadata file: $!");
        return -1; # error
    }

    my $yaml_file_content;
    my $yaml_file_reader = new multiread();
    # multiread is handed the handle by its package-qualified name
    $yaml_file_reader->set_handle('MetadataYAMLPlugin::YAML_FILE');
    $yaml_file_reader->read_file(\$yaml_file_content);
    close(YAML_FILE);

    # It would be nice if MetadataYAMLPlugin had an option for choosing the
    # character encoding of the YAML file.  For now it is assumed to be UTF-8.
    $yaml_file_content = "" if (!defined $yaml_file_content);
    $yaml_file_content = decode("utf8", $yaml_file_content);

    # YAML::Load dies on malformed input; turn that into a reported error
    # rather than letting it propagate out of the plugin.
    my $yaml_rec;
    my $parsed_ok = eval { $yaml_rec = YAML::Load($yaml_file_content); 1; };
    if (!$parsed_ok) {
        my $reason = (defined $@ && $@ ne "") ? $@ : "unknown parse error";
        $reason =~ s/\s+/ /g;   # keep the report on a single line
        $reason =~ s/ $//;
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Could not parse YAML metadata file: $reason");
        return -1; # error
    }

    # We can't associate any metadata without knowing the file to associate
    # it with.  The parsed content must be a mapping with a Filename entry.
    if (ref($yaml_rec) ne 'HASH' || !defined $yaml_rec->{'Filename'}) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "No Filename field in YAML metadata file");
        return -1; # error
    }
    if (ref($yaml_rec->{'Filename'})) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Filename field in YAML metadata file must be a single value");
        return -1; # error
    }

    my $yaml_rec_filename = delete $yaml_rec->{'Filename'};

    # Build a hash of metadata name to list of metadata values
    my %yaml_rec_metadata = ();

    foreach my $md_name (keys(%{$yaml_rec})) {

        my $md_val = $yaml_rec->{$md_name};

        # Treat everything as a list of metadata values to simplify later code
        my @md_vals;
        if (!ref($md_val)) {
            @md_vals = ( $md_val );
        }
        elsif (ref($md_val) eq 'ARRAY') {
            @md_vals = @$md_val;
        }
        else {
            # e.g. a nested mapping: there is no flat value to store
            print $outhandle "MetadataYAMLPlugin: Warning - ignoring structured value for '$md_name' in $filename_full_path\n";
            next;
        }

        # Protect square brackets in metadata values by hex entity encoding
        # them, as unescaped square bracket chars in metadata have special
        # meaning in GS' Java runtime code
        my @escaped_md_vals = ();

        foreach my $meta_value (@md_vals) {
            # empty YAML entries load as undef, and nested structures as
            # references; neither is a usable metadata value
            next if (!defined $meta_value || ref($meta_value));

            $meta_value =~ s/\[/&\#091;/g;
            $meta_value =~ s/\]/&\#093;/g;
            push(@escaped_md_vals, $meta_value);
        }

        next unless (scalar(@escaped_md_vals) > 0);

        push (@{$yaml_rec_metadata{$md_name}}, @escaped_md_vals);
    }

    # Associate the metadata now
    # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
    # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
    # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
    $yaml_rec_filename = &util::filepath_to_url_format($yaml_rec_filename);
    $yaml_rec_filename = &util::filename_to_regex($yaml_rec_filename);

    # Diagnostic dump, only at high verbosity and only to the plugin's own
    # output handle (never unconditionally to STDOUT/STDERR)
    if ($self->{'verbosity'} > 2) {
        use Data::Dumper;
        print $outhandle "MetadataYAMLPlugin: metadata for $yaml_rec_filename\n";
        print $outhandle Dumper(\%yaml_rec_metadata);
    }

    my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $yaml_rec_filename);

    if (defined $file_metadata_table) { # merge with existing meta

        foreach my $metaname (keys %yaml_rec_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$yaml_rec_metadata{$metaname}});
        }

        # no need to add the filename to $extrametakeys as it is already in the list
    }
    else { # add as new meta

        &extrametautil::setmetadata($extrametadata, $yaml_rec_filename, \%yaml_rec_metadata);
        &extrametautil::addmetakey($extrametakeys, $yaml_rec_filename);
    }

    # record which file the metadata came from
    if (!defined &extrametautil::getmetafile($extrametafile, $yaml_rec_filename)) {
        &extrametautil::setmetafile($extrametafile, $yaml_rec_filename, {});
    }

    # maps the file to full path
    &extrametautil::setmetafile_for_named_file($extrametafile, $yaml_rec_filename, $file, $filename_full_path);

    # Explicit success value, so the result no longer depends on whatever the
    # last extrametautil call happens to return
    return 1;
}
244
245
# CSV-style metadata reader kept alongside the YAML reader; nothing in this
# file calls it.
#
# NOTE(review): this looks like a working copy of a CSV plugin's
# metadata_read.  It never reads $file: $csv_file_field_line is never assigned
# and @csv_file_lines is always empty, so as written the per-line loop cannot
# run.  Confirm whether it can be removed.
#
# Takes the same arguments as metadata_read.  Returns undef when the file is
# not one this plugin processes for metadata, -1 when the header line cannot be
# parsed or has no Filename field, and 1 otherwise.
sub foo
{
    my ($self) = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='MetadataYAMLPlugin'>\n" if ($gli);
    print STDERR "MetadataYAMLPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    my $csv_file_field_line;      # header line (never filled in, see above)
    my @csv_file_lines = ();      # data lines  (never filled in, see above)
    my $md_val_sep = ",";         # separator for multiple values in one cell

    # Text::CSV is not loaded anywhere else in this file, so load it on demand
    require Text::CSV;
    my $csv = Text::CSV->new();

    # Start from an empty list; "= undef" would give a one-element list
    my @csv_file_fields = ();
    if ($csv->parse($csv_file_field_line)) {
        @csv_file_fields = $csv->fields;
    }
    else {
        my $bad_header = defined($csv_file_field_line) ? $csv_file_field_line : "";
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path,
                           "Error: Badly formatted CSV header line: $bad_header");
        return -1;
    }

    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        next if (!defined $field_name);

        # Remove any spaces from the field names, and surrounding quotes too
        # ($field_name aliases the array element, so the array is updated)
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        if ($field_name eq "Filename") {
            $found_filename_field = 1;
        }
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);
        my $orig_csv_line = $csv_line;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        if ($csv->parse($csv_line)) {
            my @md_vals = $csv->fields;

            for (my $i = 0; $i < scalar(@md_vals); $i++) {
                my $md_val = $md_vals[$i];
                my $md_name = $csv_file_fields[$i];

                # Only bother with non-empty values that have a column name
                next if (!defined $md_val || $md_val eq "");
                next if (!defined $md_name);

                # A cell may hold several values joined by the separator
                my @cell_vals = (defined $md_val_sep)
                    ? split(/${md_val_sep}/, $md_val)
                    : ( $md_val );

                # protect square brackets in metadata values by hex entity encoding them
                # As unescaped square bracket chars in metadata
                # have special meaning in GS' Java runtime code
                foreach my $meta_value (@cell_vals) {
                    $meta_value =~ s/\[/&\#091;/g;
                    $meta_value =~ s/\]/&\#093;/g;
                }

                # push creates the list on first use
                push (@{$csv_line_metadata{$md_name}}, @cell_vals);
            }
        }
        else {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line");
            last;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $orig_csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # When the first Section value contains a digit or dot, every metadata
        # name on this line (the Section entry included) is renamed to
        # "<name>///Section/<value>"
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                # not a section number after all: put the value back
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # Associate the metadata now
        # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
        # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
        # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
        $csv_line_filename = &util::filepath_to_url_format($csv_line_filename);
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $csv_line_filename);

        if (defined $file_metadata_table) { # merge with existing meta

            foreach my $metaname (keys %csv_line_metadata) {
                # will create new entry if one does not already exist
                push(@{$file_metadata_table->{$metaname}}, @{$csv_line_metadata{$metaname}});
            }

            # no need to add the filename to $extrametakeys as it is already in the list
        }
        else { # add as new meta

            &extrametautil::setmetadata($extrametadata, $csv_line_filename, \%csv_line_metadata);
            &extrametautil::addmetakey($extrametakeys, $csv_line_filename);
        }

        # record which file the metadata came from
        if (!defined &extrametautil::getmetafile($extrametafile, $csv_line_filename)) {
            &extrametautil::setmetafile($extrametafile, $csv_line_filename, {});
        }

        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename_full_path);
    }

    return 1;
}
413
# Report a problem with a metadata file.
#
# Parameters:
#   $outhandle   - handle for general output; receives the message
#   $failhandle  - handle for failure logging; receives the same message
#   $gli         - when true, a <ProcessingError .../> element is also
#                  written to STDERR
#   $file        - the file the error relates to
#   $error       - description of what went wrong
sub print_error
{
    my $self = shift(@_);
    my ($outhandle, $failhandle, $gli, $file, $error) = @_;

    # Label messages with this plugin's own name (they used to be reported
    # as coming from MetadataCSVPlugin)
    my $message = "MetadataYAMLPlugin Error: $file: $error\n";

    print {$outhandle} $message;
    print {$failhandle} $message;
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
4241;
Note: See TracBrowser for help on using the repository browser.