source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BasePlugin;
31use MetadataRead;
32
33use strict;
34no strict 'refs';
35use multiread;
36
37
# Inheritance order matters: when both parents define a method with the
# same name, the one from the class listed first in @ISA is the one found.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(MetadataRead BasePlugin);
}
42
43
# Descriptions of the arguments this plugin adds.  The only one is
# process_exp, whose default value comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];
52
53
# Option record describing this plugin; its 'args' entry points at the
# argument list declared in $arguments.
my $options = {
    'name'     => "MetadataCSVPlugin",
    'desc'     => "{MetadataCSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
59
60
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - passed through unchanged to the BasePlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by BasePlugin's constructor, blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax rather than "new BasePlugin(...)": this package defines
    # its own new(), and the indirect-object form leaves Perl to guess
    # whether a method call or a function call was meant.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
74
75
# Returns, as a string, the default regular expression for the process_exp
# argument: it matches names ending in ".csv", ignoring case.
sub get_default_process_exp
{
    my $csv_suffix_pattern = '(?i)\.csv$';
    return $csv_suffix_pattern;
}
80
# Records a CSV metadata file in $block_hash->{'metadata_files'}.
#
# Only $base_dir, $file and $block_hash are used; the remaining arguments
# are accepted but ignored.
#
# Returns undef when the path is not a plain file or
# can_process_this_file() rejects it, and 1 otherwise.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # can_process_this_file() is only consulted for plain files
    my $recognised = (-f $full_path) && $self->can_process_this_file($full_path);
    return undef unless $recognised; # can't recognise

    # Flag this as a metadata file - needed for incremental build: if the
    # file changes, everything has to be reimported.
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
98
# Reads a CSV metadata file and records, for every data row, the metadata
# to attach to the file named in that row's "Filename" column.
#
# The first line of the CSV file holds the metadata field names (spaces are
# stripped from them); one of them must be "Filename".  Each later line
# holds values in the same column order.  A value may be wrapped in double
# quotes so that it can contain commas.  Empty values, and values in
# columns beyond those named in the header, are ignored.
#
# Arguments used here:
#   $base_dir, $file - joined to form the path of the CSV file
#   $block_hash      - handed to util::block_filename for this file
#   $extrametakeys   - array ref; receives one regex-form filename per row
#   $extrametadata   - hash ref; regex-form filename => { field => [values] }
#   $extrametafile   - hash ref; regex-form filename => { $file => full path }
#   $gli             - when true, progress/error tags are printed to STDERR
# $pluginfo, $processor and $aux are accepted but not used.
#
# Returns undef if the path is not an existing file ending in ".csv"
# (any letter case), -1 if the file cannot be opened or has no "Filename"
# column, and 1 once the file has been processed.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # Read metadata from CSV files.  The suffix test ignores case so that it
    # agrees with the default process_exp, "(?i)\.csv$".
    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /\.csv$/i || !-f $filename) {
        return undef;
    }
    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    &util::block_filename($block_hash, $filename);

    # Read the CSV file to get the metadata.  Three-argument open keeps
    # characters in the file name from being taken as an open mode, and a
    # failure is reported for what it is.
    if (!open(CSV_FILE, '<', $filename)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "Could not open CSV file: $!");
        return -1; # error
    }
    my $csv_file_content;
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);
    $csv_file_content = "" unless defined $csv_file_content;

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g; # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;
    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    $csv_file_field_line = "" unless defined $csv_file_field_line; # empty file
    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names ($field_name aliases the
        # array element, so the stored name is updated)
        $field_name =~ s/ //g;
        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);

        # Build a hash of metadata name to list of metadata values for this line
        my %csv_line_metadata;
        my $i = 0;
        # The trailing comma makes the regular expressions simpler
        my $remaining = $csv_line . ",";
        while ($remaining ne "") {
            my $value;
            # Metadata values containing commas are quoted
            if ($remaining =~ s/^\"(.*?)\"\,//) {
                $value = $1;
            }
            # Normal comma-separated case
            elsif ($remaining =~ s/^(.*?)\,//) {
                $value = $1;
            }
            # The line must be formatted incorrectly
            else {
                $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $remaining");
                last;
            }

            # Only bother with non-empty values in columns the header names
            if ($value ne "" && defined($csv_file_fields[$i])) {
                push(@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
            }

            $i++;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # Associate the metadata now
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
        push(@$extrametakeys, $csv_line_filename);

        # record which file the metadata came from
        if (!defined $extrametafile->{$csv_line_filename}) {
            $extrametafile->{$csv_line_filename} = {};
        }
        # maps the file to full path
        $extrametafile->{$csv_line_filename}->{$file} = $filename;
    }

    # Explicit result: the value of a sub that simply runs off the end of a
    # loop is unspecified.
    # NOTE(review): assumes the caller treats a defined, non-negative value
    # as "handled" - confirm against the plugin dispatch code.
    return 1;
}
209
# Reports an error about $file: the same line of text is printed to
# $outhandle and to $failhandle, and when $gli is true a ProcessingError
# tag is also printed to STDERR.
sub print_error
{
    my $self = shift(@_);
    my ($outhandle, $failhandle, $gli, $file, $error) = @_;

    my $report_line = "MetadataCSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print {$handle} $report_line;
    }
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
2201;
Note: See TracBrowser for help on using the repository browser.