source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 27321

Last change on this file since 27321 was 27321, checked in by ak19, 8 years ago

Two bugfixes: 1. Handle quotes around all CSV fields, not just around the fields containing commas; this can happen when a CSV file is exported from OpenOffice's Calc spreadsheet program. 2. The second bug occurred when two PDFs, called one.pdf and two.pdf, have metadata assigned in meta.csv: the metadata was duplicated for two.pdf (2 dc.Title, 2 dc.Author). If the two PDFs were called 1.pdf and 2.pdf, the metadata was duplicated for both files. Thanks to Kathy, who found that this had something to do with the order in which the documents and meta.csv get processed when the EmbeddedMetadataPlugin is also in the plugin list. She also found a different bug: while EmbeddedMetadataPlugin merged its own extrameta with existing extrameta, MetadataCSVPlugin did not merge but overwrote all metadata with its own. After adding merging of extrameta to MetadataCSVPlugin, the initial bug of duplicate assignment of the metadata in the CSV file was resolved too.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BasePlugin;
31use MetadataRead;
32
33use strict;
34no strict 'refs';
35
36use extrametautil;
37use multiread;
38use util;
39
40use Encode;
41
# @ISA is searched in the order given, so where both parent classes provide
# a method of the same name the MetadataRead one takes precedence.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(MetadataRead BasePlugin);
}
46
47
# Argument list for this plugin: a single process_exp option whose default
# value is supplied by get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];
56
57
# Option block describing this plugin; new() pushes it onto the "OptList"
# of the hash it is given.
my $options = {
    'name'     => 'MetadataCSVPlugin',
    'desc'     => '{MetadataCSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
63
64
sub new
{
    # Constructor.
    #
    # Parameters (after the class name):
    #   $pluginlist      - array ref; this class name is appended to it
    #   $inputargs       - passed through unchanged to BasePlugin's constructor
    #   $hashArgOptLists - hash ref; this plugin's argument list is appended
    #                      to its "ArgList" entry and its option block to its
    #                      "OptList" entry
    #
    # Returns the object built by BasePlugin, re-blessed into $class.
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than the indirect-object form
    # "new BasePlugin(...)", whose parsing depends on which barewords the
    # compiler already knows about at this point.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
78
79
sub get_default_process_exp
{
    # Default value for the process_exp argument: a case-insensitive match
    # on a trailing ".csv" extension.
    my $csv_extension_exp = q^(?i)\.csv$^;
    return $csv_extension_exp;
}
84
sub file_block_read {
    # Registers a CSV metadata file in $block_hash->{'metadata_files'}.
    #
    # Returns undef when the path is not a plain file or is not a file this
    # plugin can process, and 1 once the file has been registered.
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # can't recognise
    unless (-f $full_path && $self->can_process_this_file($full_path)) {
        return undef;
    }

    # Flag the file as a metadata file. This is needed for incremental
    # building: if this file changes, everything has to be reimported.
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
102
sub metadata_read
{
    # Reads a metadata CSV file and records, for every data row, the
    # metadata to attach to the file named in that row's "Filename" column.
    # The first line of the CSV file supplies the metadata names.
    #
    # Parameters (after $self):
    #   $pluginfo         - not used by this method
    #   $base_dir, $file  - joined to form the path of the candidate CSV file
    #   $block_hash       - the CSV file is registered here through
    #                       util::block_filename() so that it is not
    #                       processed again in read
    #   $extrametakeys, $extrametadata, $extrametafile
    #                     - extrameta structures, updated via extrametautil
    #   $processor, $aux  - not used by this method
    #   $gli              - when true, GLI-style tags are printed to STDERR
    #
    # Returns undef when $file is not an existing .csv file, -1 when the
    # file cannot be opened or has no Filename column, and 1 once the file
    # has been processed.
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # Read metadata from CSV files. The extension is matched
    # case-insensitively, in line with the default process_exp, so that a
    # file such as "META.CSV" is not silently skipped.
    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /\.csv$/i || !-f $filename) {
        return undef;
    }
    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    &util::block_filename($block_hash, $filename);

    # Read the CSV file to get the metadata. Three-argument open keeps odd
    # characters in the filename from being interpreted as an open mode, and
    # a failure is reported instead of being silently ignored.
    if (!open(CSV_FILE, "<", $filename)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "Could not open CSV file for reading: $!");
        return -1; # error
    }
    my $csv_file_content;
    my $csv_file_reader = multiread->new();
    # The handle is passed by name, as in the rest of this file.
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);

    # An empty file may leave the content undefined; treat it as empty text.
    $csv_file_content = "" unless defined $csv_file_content;

    # Would be nice if MetadataCSVPlugin was extended to support a minus
    # option to choose the character encoding the CSV file is in
    # For now we will assume it is always in UTF8
    $csv_file_content = decode("utf8", $csv_file_content);
    $csv_file_content = "" unless defined $csv_file_content;

    # A leading byte order mark would otherwise become part of the first
    # field name and stop a leading "Filename" column from being recognised.
    $csv_file_content =~ s/^\x{FEFF}//;

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g; # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;
    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    $csv_file_field_line = "" unless defined $csv_file_field_line; # empty file
    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    my $found_filename_field = 0;
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too.
        # $field_name aliases the array element, so the array is cleaned in place.
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        $found_filename_field = 1 if ($field_name eq "Filename");
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename field in CSV file");
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);

        # Build a hash of metadata name to (array of) metadata values for this line
        my %csv_line_metadata;
        my $i = 0;
        # Work on a copy with a trailing comma, to make the regular
        # expressions simpler; fields are consumed from the front.
        my $remaining = $csv_line . ",";
        while ($remaining ne "") {
            my $value;
            my $strip_quotes = 0;

            # Metadata values containing commas are quoted
            if ($remaining =~ s/^\"(.*?)\"\,//) {
                $value = $1;
            }
            # Normal comma-separated case
            elsif ($remaining =~ s/^(.*?)\,//) {
                $value = $1;
                # Remove any surrounding quotes. (When exporting to CSV, some
                # spreadsheet programs add quotes even around field values
                # that don't contain commas.)
                $strip_quotes = 1;
            }
            # The line must be formatted incorrectly
            else {
                $self->print_error($outhandle, $failhandle, $gli, $filename, "Badly formatted CSV line: $remaining");
                last;
            }

            # Only bother with non-empty values that have a column name
            if ($value ne "" && defined($csv_file_fields[$i])) {
                if ($strip_quotes) {
                    $value =~ s/^"//;
                    $value =~ s/"$//;
                }
                # push creates the value list if this is the first value
                push(@{$csv_line_metadata{$csv_file_fields[$i]}}, $value);
            }

            $i++;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            $self->print_error($outhandle, $failhandle, $gli, $filename, "No Filename metadata in CSV line: $csv_line");
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # Associate the metadata now
        # Indexing into the extrameta data structures requires the filename's
        # style of slashes to be in URL format. Then need to convert the
        # filename to a regex, no longer to protect windows directory chars \,
        # but for protecting special characters like brackets in the filepath
        # such as "C:\Program Files (x86)\Greenstone".
        $csv_line_filename = &util::filepath_to_url_format($csv_line_filename);
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        # Look the existing entry up once and reuse the result.
        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $csv_line_filename);
        if (defined $file_metadata_table) { # merge with existing meta
            foreach my $metaname (keys %csv_line_metadata) {
                # will create new entry if one does not already exist
                push(@{$file_metadata_table->{$metaname}}, @{$csv_line_metadata{$metaname}});
            }
            # no need to add the filename to $extrametakeys as it is already in the list
        }
        else { # add as new meta
            &extrametautil::setmetadata($extrametadata, $csv_line_filename, \%csv_line_metadata);
            &extrametautil::addmetakey($extrametakeys, $csv_line_filename);
        }

        # record which file the metadata came from
        if (!defined &extrametautil::getmetafile($extrametafile, $csv_line_filename)) {
            &extrametautil::setmetafile($extrametafile, $csv_line_filename, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $csv_line_filename, $file, $filename);
    }

    # Explicit success value: the implicit return value of a subroutine
    # whose last statement is a foreach loop is unspecified.
    return 1;
}
245
sub print_error
{
    # Reports a problem with a file: the same one-line message is written to
    # both $outhandle and $failhandle and, when $gli is true, a
    # ProcessingError tag is additionally written to STDERR.
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "MetadataCSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print $handle $message;
    }
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
2561;
Note: See TracBrowser for help on using the repository browser.