source: main/trunk/greenstone2/perllib/plugins/MetadataCSVDeprecatedPlugin.pm@ 36479

Last change on this file since 36479 was 36479, checked in by kjdon, 20 months ago

renaming the old CSVPlugin and MetadataCSVPlugin to Deprecated versions, prior to adding the new CSVPlugin which handles both cases, and is new and improved.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.0 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BaseImporter;
31use MetadataRead;
32use CSVFieldSeparator;
33
34use strict;
35no strict 'refs';
36
37use multiread;
38use util;
39
40use Encode;
41use Text::CSV;
42
# Inheritance is set up at compile time. When more than one parent defines
# a method with the same signature, the parent listed first in @ISA wins.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(MetadataRead BaseImporter CSVFieldSeparator);
}
47
48
49
# Argument descriptions specific to this plugin. new() appends them to the
# shared "ArgList" it is handed.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
];

# Plugin description record. new() appends it to the shared "OptList".
my $options = {
    name     => 'MetadataCSVPlugin',
    desc     => '{MetadataCSVPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
65
66
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by BaseImporter, re-blessed into $class.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The CSVFieldSeparator constructor's return value is deliberately
    # discarded; presumably it is called only for its effect on the shared
    # argument lists -- TODO confirm against CSVFieldSeparator.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
81
82
# Returns the default process_exp as a regular-expression source string:
# a case-insensitive match on names ending in ".csv".
sub get_default_process_exp
{
    my $csv_extension_exp = q{(?i)\.csv$};
    return $csv_extension_exp;
}
87
# Registers a CSV file this plugin can process as a metadata file.
#
# Returns undef when the path is not a plain file or is rejected by
# can_process_this_file(); otherwise records the full path under
# $block_hash->{'metadata_files'} and returns 1.
sub file_block_read
{
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = util::get_full_filenames($base_dir, $file);

    my $recognised = (-f $full_path) && $self->can_process_this_file($full_path);
    unless ($recognised) {
        return undef; # can't recognise
    }

    # Mark the file as a metadata file -- needed for incremental build:
    # if this file changes, then everything has to be reimported.
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
105
# Private helper: returns a copy of $value with literal square brackets
# replaced by numeric character entities. Unescaped square bracket chars in
# metadata have special meaning in GS' Java runtime code.
sub _escape_square_brackets
{
    my ($value) = @_;
    $value =~ s/\[/&\#091;/g;
    $value =~ s/\]/&\#093;/g;
    return $value;
}

# Reads a CSV metadata file and hands the metadata of every data row to
# store_meta_in_extrametadata(), keyed by the row's "Filename" value.
#
# The first line of the file names the metadata fields and must contain a
# "Filename" column. A "Section" column whose value contains digits/dots
# causes every metadata name on that row to get a "///Section/<value>"
# suffix.
#
# Returns:
#   undef  when can_process_this_file_for_metadata() rejects the file
#   -1     when the file cannot be opened, is empty, has a malformed header
#          line, or has no "Filename" column
#   1      once the file has been worked through
#          NOTE(review): the original fell off the end of the loop and so
#          returned an unspecified value; 1 mirrors file_block_read's
#          success value -- confirm against the caller's expectations.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    $self->block_raw_filename($block_hash,$filename_full_path);

    # Read the CSV file to get the metadata. Three-argument open keeps odd
    # characters in the path from being interpreted as an open mode, and a
    # failure is reported instead of silently reading nothing. The bareword
    # handle is kept because multiread is given the handle by name.
    my $csv_file_content;
    if (!open(CSV_FILE, '<', $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Could not open CSV file for reading: $!");
        return -1;
    }
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);

    # Would be nice if MetadataCSVPlugin was extended to support a minus
    # option to choose the character encoding the CSV file is in
    # For now we will assume it is always in UTF8
    $csv_file_content = "" unless defined $csv_file_content;
    $csv_file_content = decode("utf8",$csv_file_content);

    # A leading byte-order mark would otherwise become part of the first
    # field name and stop it from matching "Filename".
    $csv_file_content =~ s/^\x{FEFF}//;

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g; # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;

    my $separate_char = $self->{'csv_field_separator'};

    # An empty value separator means "do not split values".
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);

    if (!defined $csv_file_field_line) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Empty CSV file: no header line");
        return -1;
    }

    if ($separate_char =~ m/^auto$/i) {
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if (!$csv->parse($csv_file_field_line)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
        return -1;
    }
    my @csv_file_fields = $csv->fields;

    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        # ($field_name aliases the array element, so this edits in place)
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);

        # A malformed row stops processing of the remaining rows
        if (!$csv->parse($csv_line)) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Badly formatted CSV line: $csv_line");
            last;
        }

        # Build a hash of metadata name to (array of) metadata values for this line
        my %csv_line_metadata;
        my @md_vals = $csv->fields;

        for my $i (0 .. $#md_vals) {
            my $md_name = $csv_file_fields[$i];
            my $md_val  = $md_vals[$i];

            # Skip values beyond the header's columns, and empty values
            next unless defined $md_name;
            next unless (defined $md_val && $md_val ne "");

            # Optionally split one cell into several values
            my @cell_values = (defined $md_val_sep)
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            push (@{$csv_line_metadata{$md_name}},
                  map { _escape_square_brackets($_) } @cell_values);
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No Filename metadata in CSV line: $csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            # NOTE(review): this pattern is unanchored, so any value that
            # merely contains a digit or a dot is treated as a section
            # number -- confirm whether /^[\d.]+$/ was intended.
            if (defined $section_value && $section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                # keys() yields a snapshot list, so renaming entries while
                # looping over it is safe
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            elsif (defined $section_value) {
                # Not a section number: put the value back as plain metadata
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # Associate the metadata now
        $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
    }

    return 1;
}
275
# Reports an error about $file.
#
# The same "MetadataCSVPlugin Error" line is written to both $outhandle and
# $failhandle; when $gli is true a <ProcessingError> element is also
# written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $report_line = "MetadataCSVPlugin Error: $file: $error\n";
    print {$outhandle} $report_line;
    print {$failhandle} $report_line;

    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
2861;
Note: See TracBrowser for help on using the repository browser.