source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 24736

Last change on this file since 24736 was 24736, checked in by ak19, 13 years ago

Perl strings that result from reading in data from text files need to be Unicode aware strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BasePlugin;
31use MetadataRead;
32
33use strict;
34no strict 'refs';
35use multiread;
36
37
38# methods with identical signatures take precedence in the order given in the ISA list.
39sub BEGIN {
40 @MetadataCSVPlugin::ISA = ('MetadataRead', 'BasePlugin');
41}
42
43
44my $arguments = [
45 { 'name' => "process_exp",
46 'desc' => "{BasePlugin.process_exp}",
47 'type' => "regexp",
48 'reqd' => "no",
49 'deft' => &get_default_process_exp() }
50
51];
52
53
54my $options = { 'name' => "MetadataCSVPlugin",
55 'desc' => "{MetadataCSVPlugin.desc}",
56 'abstract' => "no",
57 'inherits' => "yes",
58 'args' => $arguments };
59
60
61sub new
62{
63 my ($class) = shift (@_);
64 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
65 push(@$pluginlist, $class);
66
67 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
68 push(@{$hashArgOptLists->{"OptList"}},$options);
69
70 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
71
72 return bless $self, $class;
73}
74
75
76sub get_default_process_exp
77{
78 return q^(?i)\.csv$^;
79}
80
81sub file_block_read {
82 my $self = shift (@_);
83 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
84
85 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
86
87 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
88 return undef; # can't recognise
89 }
90
91 # set this so we know this is a metadata file - needed for incremental
92 # build
93 # if this file changes, then we need to reimport everything
94 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
95
96 return 1;
97}
98
99sub metadata_read
100{
101 my $self = shift (@_);
102 my ($pluginfo, $base_dir, $file, $block_hash,
103 $extrametakeys, $extrametadata, $extrametafile,
104 $processor, $gli, $aux) = @_;
105
106 # Read metadata from CSV files
107 my $filename = &util::filename_cat($base_dir, $file);
108 if ($filename !~ /\.csv$/ || !-f $filename) {
109 return undef;
110 }
111 print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
112 print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
113
114 my $outhandle = $self->{'outhandle'};
115 my $failhandle = $self->{'failhandle'};
116
117 # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
118 &util::block_filename($block_hash,$filename);
119
120
121 # Read the CSV file to get the metadata
122 my $csv_file_content;
123 open(CSV_FILE, "$filename");
124 my $csv_file_reader = new multiread();
125 $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
126 $csv_file_reader->read_file(\$csv_file_content);
127
128 # Would be nice if MetadataCSVPlugin was extended to support a minus
129 # option to choose the character encoding the CSV file is in
130 # For now we will assume it is always in UTF8
131 $csv_file_content = decode("utf8",$csv_file_content);
132
133 close(CSV_FILE);
134
135 # Split the file into lines and read the first line (contains the metadata names)
136 $csv_file_content =~ s/\r/\n/g; # Handle non-Unix line endings
137 $csv_file_content =~ s/\n+/\n/g;
138 my @csv_file_lines = split(/\n/, $csv_file_content);
139 my $csv_file_field_line = shift(@csv_file_lines);
140 my @csv_file_fields = split(/\,/, $csv_file_field_line);
141 my $found_filename_field = 0;
142 for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
143 # Remove any spaces from the field names
144 $csv_file_fields[$i] =~ s/ //g;
145 if ($csv_file_fields[$i] eq "Filename") {
146 $found_filename_field = 1;
147 }
148 }
149
150 if (!$found_filename_field) {
151 $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
152 return -1; # error
153 }
154 # Read each line of the file and assign the metadata appropriately
155 foreach my $csv_line (@csv_file_lines) {
156 # Ignore lines containing only whitespace
157 next if ($csv_line =~ /^\s*$/);
158 my $orig_csv_line = $csv_line;
159 # Build a hash of metadata name to metadata value for this line
160 my %csv_line_metadata;
161 my $i = 0;
162 $csv_line .= ","; # To make the regular expressions simpler
163 while ($csv_line ne "") {
164 # Metadata values containing commas are quoted
165 if ($csv_line =~ s/^\"(.*?)\"\,//) {
166 # Only bother with non-empty values
167 if ($1 ne "" && defined($csv_file_fields[$i])) {
168 if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
169 $csv_line_metadata{$csv_file_fields[$i]} = [];
170 }
171 push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $1);
172 }
173 }
174 # Normal comma-separated case
175 elsif ($csv_line =~ s/^(.*?)\,//) {
176 # Only bother with non-empty values
177 if ($1 ne "" && defined($csv_file_fields[$i])) {
178 if (!defined $csv_line_metadata{$csv_file_fields[$i]}) {
179 $csv_line_metadata{$csv_file_fields[$i]} = [];
180 }
181 push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $1);
182 }
183 }
184 # The line must be formatted incorrectly
185 else {
186 $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $csv_line");
187 last;
188 }
189
190 $i++;
191 }
192
193 # We can't associate any metadata without knowing the file to associate it with
194 my $csv_line_filename_array = $csv_line_metadata{"Filename"};
195 if (!defined $csv_line_filename_array) {
196 $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $orig_csv_line");
197 next;
198 }
199 my $csv_line_filename = shift(@$csv_line_filename_array);
200 delete $csv_line_metadata{"Filename"};
201
202
203 # Associate the metadata now
204 $csv_line_filename = &util::filename_to_regex($csv_line_filename);
205
206 $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
207 push(@$extrametakeys, $csv_line_filename);
208 # record which file the metadata came from
209 if (!defined $extrametafile->{$csv_line_filename}) {
210 $extrametafile->{$csv_line_filename} = {};
211 }
212 # maps the file to full path
213 $extrametafile->{$csv_line_filename}->{$file} = $filename;
214 }
215}
216
217sub print_error
218{
219
220 my $self = shift(@_);
221 my ($outhandle, $failhandle, $gli, $file, $error) = @_;
222
223 print $outhandle "MetadataCSVPlugin Error: $file: $error\n";
224 print $failhandle "MetadataCSVPlugin Error: $file: $error\n";
225 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
226}
2271;
Note: See TracBrowser for help on using the repository browser.