source: main/trunk/greenstone2/perllib/plugins/MetadataCSVPlugin.pm@ 22705

Last change on this file since 22705 was 20804, checked in by kjdon, 15 years ago

Adding back in the code to store which file the metadata came from. This is used later to add metadata files into the reverse lookup database - otherwise we can't tell which files are new and which are existing.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# MetadataCSVPlugin.pm -- A plugin for metadata in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package MetadataCSVPlugin;
28
29
30use BasePlugin;
31use strict;
32
33use multiread;
34
35
# Establish inheritance at compile time: this plugin is a BasePlugin.
BEGIN {
    @MetadataCSVPlugin::ISA = qw(BasePlugin);
}
39
40
# Argument descriptions specific to this plugin. The only one is process_exp,
# whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
];


# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "MetadataCSVPlugin",
    'desc'     => "{MetadataCSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
56
57
# Constructor.
#
# Registers this class in $pluginlist, appends this plugin's argument and
# option descriptions to $hashArgOptLists, lets BasePlugin build the object,
# and then re-blesses that object into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @$arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $plugin = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($plugin, $class);
}
71
72
# Default regular expression (as a string) selecting the files this plugin
# handles: any name ending in ".csv", matched case-insensitively.
sub get_default_process_exp
{
    my $default_exp = '(?i)\.csv$';
    return $default_exp;
}
77
# file_block_read -- note CSV files in the block hash as metadata files.
#
# Returns undef when $file is not a regular file or is not a file this plugin
# can process; otherwise records the full path under
# $block_hash->{'metadata_files'} and returns 1.
sub file_block_read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # can't recognise anything that isn't a plain file we are configured for
    return undef unless (-f $full_path);
    return undef unless ($self->can_process_this_file($full_path));

    # Flag this as a metadata file - needed for incremental build: if this
    # file changes, then we need to reimport everything
    $block_hash->{'metadata_files'}->{$full_path} = 1;

    return 1;
}
95
# metadata_read -- read per-file metadata out of a CSV file.
#
# The first line of the CSV file names the metadata fields (spaces in the
# names are removed) and must include a "Filename" field. Each later line
# supplies values for the file named in its Filename column. Values that
# contain commas must be wrapped in double quotes. Empty values are skipped.
#
# Results are written into the structures passed in by the caller:
#   $block_hash     - the CSV file is added to 'file_blocks' so that it is
#                     not processed again in read
#   $extrametadata  - maps util::filename_to_regex(Filename value) to a hash
#                     of field name => array ref of values; a later line for
#                     the same Filename replaces the earlier entry
#   $extrametakeys  - the same keys, pushed in the order they were read
#   $extrametafile  - maps each key to { $file => full path of this CSV file }
#
# Returns undef if $file is not an existing .csv file, -1 on error (the file
# cannot be opened or has no Filename field) and 1 on success.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $maxdocs, $gli) = @_;

    # Read metadata from CSV files. The extension test is case-insensitive so
    # that it agrees with the default process_exp used by file_block_read.
    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /\.csv$/i || !-f $filename) {
        return undef;
    }
    print STDERR "\n<Processing n='$file' p='MetadataCSVPlugin'>\n" if ($gli);
    print STDERR "MetadataCSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    $block_hash->{'file_blocks'}->{$filename} = 1;

    # Read the CSV file to get the metadata. Three-argument open keeps any
    # unusual characters in the filename from being treated as an open mode.
    my $csv_file_content;
    if (!open(CSV_FILE, '<', $filename)) {
        print STDERR "MetadataCSVPlugin Error: Could not open CSV file $filename: $!\n";
        return -1; # error
    }
    my $csv_file_reader = multiread->new();
    $csv_file_reader->set_handle('MetadataCSVPlugin::CSV_FILE');
    $csv_file_reader->read_file(\$csv_file_content);
    close(CSV_FILE);

    # An empty file may leave the content undefined
    $csv_file_content = "" unless (defined $csv_file_content);

    # Split the file into lines and read the first line (contains the metadata names)
    $csv_file_content =~ s/\r/\n/g;  # Handle non-Unix line endings
    $csv_file_content =~ s/\n+/\n/g;
    my @csv_file_lines = split(/\n/, $csv_file_content);
    my $csv_file_field_line = shift(@csv_file_lines);
    $csv_file_field_line = "" unless (defined $csv_file_field_line);
    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    my $found_filename_field = 0;
    foreach my $csv_file_field (@csv_file_fields) {
        # Remove any spaces from the field names ($csv_file_field aliases the
        # array element, so the stored name is updated too)
        $csv_file_field =~ s/ //g;
        $found_filename_field = 1 if ($csv_file_field eq "Filename");
    }

    if (!$found_filename_field) {
        print STDERR "MetadataCSVPlugin Error: No Filename field in CSV file: $filename\n";
        return -1; # error
    }

    # Read each line of the file and assign the metadata appropriately
    foreach my $csv_line (@csv_file_lines) {
        # Ignore lines containing only whitespace
        next if ($csv_line =~ /^\s*$/);
        my $orig_csv_line = $csv_line;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;
        my $i = 0;
        $csv_line .= ",";  # To make the regular expressions simpler
        while ($csv_line ne "") {
            my $csv_value;
            # Metadata values containing commas are quoted
            if ($csv_line =~ s/^\"(.*?)\"\,//) {
                $csv_value = $1;
            }
            # Normal comma-separated case
            elsif ($csv_line =~ s/^(.*?)\,//) {
                $csv_value = $1;
            }
            # The line must be formatted incorrectly
            else {
                print STDERR "MetadataCSVPlugin Error: Badly formatted CSV line: $csv_line.\n";
                last;
            }

            # Only bother with non-empty values that fall under a named field
            if ($csv_value ne "" && defined($csv_file_fields[$i])) {
                push (@{$csv_line_metadata{$csv_file_fields[$i]}}, $csv_value);
            }

            $i++;
        }

        # We can't associate any metadata without knowing the file to associate it with
        my $csv_line_filename_array = $csv_line_metadata{"Filename"};
        if (!defined $csv_line_filename_array) {
            print STDERR "MetadataCSVPlugin Error: No Filename metadata in CSV line: $orig_csv_line\n";
            next;
        }
        my $csv_line_filename = shift(@$csv_line_filename_array);
        delete $csv_line_metadata{"Filename"};

        # Associate the metadata now
        $csv_line_filename = &util::filename_to_regex($csv_line_filename);

        $extrametadata->{$csv_line_filename} = \%csv_line_metadata;
        push(@$extrametakeys, $csv_line_filename);

        # record which file the metadata came from
        if (!defined $extrametafile->{$csv_line_filename}) {
            $extrametafile->{$csv_line_filename} = {};
        }
        # maps the file to full path
        $extrametafile->{$csv_line_filename}->{$file} = $filename;
    }

    # Explicit success value: without it the sub ends in a foreach loop and
    # its return value is unspecified.
    return 1;
}
203
204
2051;
Note: See TracBrowser for help on using the repository browser.