source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 37047

Last change on this file since 37047 was 37047, checked in by davidb, 16 months ago

Introduction of 'metadata_separate_fields', a plugin option that controls which fields have the value-separator split applied. By default all fields are split when the value separator character is specified; however, there are situations where you want to split on (say) ',' for a Keyword field but not for an Abstract field that happens to contain commas.

File size: 16.4 KB
Line 
###########################################################################
#
# CSVPlugin.pm -- A plugin for files in comma-separated value format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
26
27package CSVPlugin;
28
29use extrametautil;
30
31use ReadTextFile;
32use SplitTextFile; # for a couple routines, but we not inheriting
33use MetadataRead;
34use CSVFieldSeparator;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use Text::CSV;
40
# Compile-time setup for the package: switch STDERR to UTF-8 output and
# declare the three parent classes this plugin inherits from.
sub BEGIN {
    binmode(STDERR, ":utf8");
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
}
46
47
# Option descriptions specific to this plugin.  new() appends this list to
# the shared "ArgList"; each entry gives the option name, a description
# key, its type, and whether it is required.  Only the first two options
# declare a default value.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'filename_field',
        'desc' => '{CSVPlugin.filename_field}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => 'Filename',
    },
    {
        'name' => 'no_document_if_source_unspecified',
        'desc' => '{CSVPlugin.no_document_if_source_unspecified}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'no_document_if_source_missing',
        'desc' => '{CSVPlugin.no_document_if_source_missing}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_namespace_for_field_names',
        'desc' => '{CSVPlugin.use_namespace_for_field_names}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'store_field_values_as_document_text',
        'desc' => '{CSVPlugin.store_field_values_as_document_text}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];
79
80
# Plugin description record; new() appends it to the shared "OptList".
# It ties the plugin name to the argument list declared for this package.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
87
88
# Default value for the process_exp option: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
93
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list and option
#                      record are appended to its "ArgList" / "OptList"
#
# Returns the object produced by ReadTextFile's constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # The CSVFieldSeparator object itself is discarded; the call is made
    # for whatever it does to the shared lists passed in (presumably
    # registering its own options -- confirm against CSVFieldSeparator).
    # Direct method-call syntax is used instead of the ambiguous
    # indirect-object form "new Class(...)".
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'} = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    # Treat a namespace consisting only of whitespace as "not set"
    my $namespace = $self->{'use_namespace_for_field_names'};
    if ((defined $namespace) && ($namespace =~ m/^\s*$/)) {
        $self->{'use_namespace_for_field_names'} = undef;
    }

    return bless $self, $class;
}
115
116
# Record the CSV file in the block hash as a metadata file.
#
# Builds the full path from $base_dir and $file; when this plugin cannot
# process that path nothing is recorded.  On native Windows (GSDLOS is
# "windows" and Perl is not the cygwin build) the path is first passed
# through util::upgrade_if_dos_filename, because paths stored in the block
# hash are long filenames.
#
# Always returns undef.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $csv_path = FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($csv_path)) {
        return undef;
    }

    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($on_native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $csv_path = util::upgrade_if_dos_filename($csv_path);
    }

    $block_hash->{'metadata_files'}->{$csv_path} = 1;
    return undef;
}
147
# Replace literal square brackets in a metadata value with numeric
# character entities and return the result.  Unescaped square bracket
# chars in metadata have special meaning in GS' Java runtime code.
sub _escape_square_brackets {
    my ($value) = @_;
    $value =~ s/\[/&\#091;/g;
    $value =~ s/\]/&\#093;/g;
    return $value;
}

# Read a CSV file during the metadata pass.
#
# The first row is taken as the list of field (metadata) names.  Each later
# row becomes a hash of field name => array ref of values.  A row whose
# filename field names a file that exists under $base_dir is handed to
# store_meta_in_extrametadata(); every other row is kept in
# $self->{'metadata_store'}->{$file}, keyed by row number, so that read()
# can turn it into a metadata-only document (unless one of the
# no_document_if_source_* options suppresses that).
#
# Parameters of note:
#   $base_dir, $file   - location of the CSV file
#   $extrametakeys, $extrametadata, $extrametafile
#                      - passed through to store_meta_in_extrametadata()
#   $gli               - when true, progress/error markup is also written
#                        to STDERR
#
# Returns undef when this plugin cannot process the file, -1 when the file
# cannot be opened or its header row cannot be parsed, and 1 once the file
# has been read.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity = $self->{'verbosity'};

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($verbosity > 1);

    # don't add to block list, as we may do some processing in read.

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($verbosity > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    $self->{'textcat_store'}->{$file} = { 'language' => $language, 'encoding' => $content_encoding };

    # used to record metadata for segments with no src doc
    my $metadata_store = {};
    $self->{'metadata_store'}->{$file} = $metadata_store;

    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        # Report the real cause rather than letting the header parse fail later
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file for reading: $!");
        return -1;
    }

    # An empty value separator means "do not split values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    $md_val_sep = undef if (defined $md_val_sep && $md_val_sep eq "");

    my $separate_char = $self->{'csv_field_separator'};
    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line, $self->{'plugin_type'});
        seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
    }

    # Optional comma-separated list of the fields that the value separator
    # applies to.  When it is not given, the separator applies to all fields.
    my $md_sep_fields_lookup;
    my $md_sep_fields = $self->{'metadata_separate_fields'};
    if (defined $md_sep_fields && $md_sep_fields ne "") {
        $md_sep_fields_lookup = { map { $_ => 1 } split(/\s*,\s*/, $md_sep_fields) };
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline($CSV_FILE);
    if (!defined $first_row) {
        # The raw header line is only available when the separator was
        # auto-detected; otherwise fall back on the parser's own diagnostic.
        my $detail = defined $csv_file_field_line ? $csv_file_field_line : "" . $csv->error_diag();
        $detail =~ s/[\r\n]+$//;
        close($CSV_FILE);
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $detail");
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        $field_name =~ s/ //g;
        $field_name =~ s/^"//;
        $field_name =~ s/"$//;

        if ($self->{'use_namespace_for_field_names'}) {
            $field_name = $self->{'use_namespace_for_field_names'} . "." . $field_name;
        }
        if ($field_name eq $filename_field) {
            $found_filename_field = 1;
        }
    }

    if (!$found_filename_field) {
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    my $count = 0;
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;
        $count++;

        # Build a hash of metadata name to metadata value(s) for this line
        my %csv_line_metadata;

        for (my $i = 0; $i < scalar(@md_vals); $i++) {
            my $md_val = $md_vals[$i];
            # Only bother with non-empty values that have a header field
            next unless ($md_val ne "" && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];

            # Split this value only when a separator is set and either no
            # field restriction is in play or this field is one of those listed
            my $needs_md_val_sep = 0;
            if (defined $md_val_sep) {
                $needs_md_val_sep = (defined $md_sep_fields_lookup)
                    ? ($md_sep_fields_lookup->{$md_name} ? 1 : 0)
                    : 1;
            }

            # NOTE(review): the separator is used as a regular expression
            # here, so a separator such as "|" must already be escaped by
            # whoever configures it -- confirm this is intended.
            my @raw_values = $needs_md_val_sep
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            $csv_line_metadata{$md_name} = [ map { _escape_square_brackets($_) } @raw_values ];
        }

        # A "Section" value containing digits/dots gives every field name on
        # this line a "///Section/<value>" suffix.
        my $section_suffix = "";
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                # Assign to the outer variable (no "my") so the filename
                # lookup below can find its renamed key.
                $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    $csv_line_metadata{$metaname . $section_suffix} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $missing_srcdoc = 0;
        my $csv_line_filename = "";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?  The key carries the section
            # suffix when the fields on this line were renamed above.
            my $filename_key = $filename_field . $section_suffix;
            my $csv_line_filename_array = $csv_line_metadata{$filename_key};
            if (!defined $csv_line_filename_array) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            }
            else {
                $csv_line_filename = shift(@$csv_line_filename_array);
                # TODO - have an option for whether we do this or not
                if (FileUtils::fileExists(FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;
                    delete $csv_line_metadata{$filename_key};
                }
                else {
                    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
                    $missing_srcdoc = 1; # there was one mentioned but its not found
                }
            }
        }

        if ($has_srcdoc) {
            print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        }
        else {
            my $store_for_dummy = 1;
            if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
                $store_for_dummy = 0;
            }
            elsif (!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
                $store_for_dummy = 0;
            }
            if ($store_for_dummy) {
                print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
                $metadata_store->{$count} = \%csv_line_metadata;
            }
        }
    } # while csv_line = csv->getline

    close($CSV_FILE);
    return 1;
}
369
# Adapted from read() in SplitTextFile.
#
# Turns every row that metadata_read() kept in
# $self->{'metadata_store'} for this file into its own document object and
# passes it to $processor.  Each document receives Language, Encoding,
# SourceSegment and Plugin metadata, the metadata passed in via $metadata,
# and the row's own stored metadata.  When the
# store_field_values_as_document_text option is set, the row's values are
# also joined into the document text.
#
# Returns undef when this plugin cannot process the file, 0 when no
# language/encoding record was stored for it by metadata_read(), otherwise
# the number of document objects produced.  Stops early once
# $self->{'num_processed'} reaches $maxdocs (unless $maxdocs is -1).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # metadata_read() keyed its per-file stores on $file exactly as it was
    # passed in, so look them up under that name first and only fall back to
    # the tidied name.
    my $store_key = $file;
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $store_key = $file unless (defined $self->{'textcat_store'}->{$store_key});

    my $le_rec = delete $self->{'textcat_store'}->{$store_key};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if ($verbosity > 1);

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    # a hash of seg num to metadata hash
    my $metadata_store = $self->{'metadata_store'}->{$store_key} || {};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);

    my $count = 0; # num doc objs produced
    my $id;        # base document ID, computed from the first document

    # Process each segment (record number) in turn, in numeric order
    foreach my $segment (sort { $a <=> $b } keys (%$metadata_store)) {
        print $outhandle "processing segment $segment as its own document\n"
            if ($verbosity > 1);
        $count++;

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $cursection = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # add our stored metadata from metadata_read pass
        my $segment_metadata = $metadata_store->{$segment};
        $self->extra_metadata($doc_obj, $cursection, $segment_metadata);

        if ($self->{'store_field_values_as_document_text'}) {
            # Walk the fields in sorted order so the generated text is the
            # same from one run to the next (hash order is not stable).
            my $new_text = "";
            foreach my $field (sort keys %$segment_metadata) {
                $new_text .= join(", ", @{$segment_metadata->{$field}}) . ", ";
            }
            $doc_obj->add_utf8_text($cursection, $new_text);
        }
        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID.
        if (!defined $id) {
            $id = SplitTextFile::get_base_OID($self, $doc_obj);
        }

        # add an OID
        SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'}++;
        if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
            last;
        }
    }

    delete $self->{'metadata_store'}->{$store_key};

    # Return number of document objects produced
    return $count;
}
469
# Write a warning about $file to both the output handle and the fail
# handle.  When $gli is true, a <ProcessingError> element is also written
# to STDERR.
sub print_warning {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "CSVPlugin Warning: $file: $error\n";
    print {$outhandle} $message;
    print {$failhandle} $message;
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
# Write an error about $file to both the output handle and the fail
# handle.  When $gli is true, a <ProcessingError> element is also written
# to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "CSVPlugin Error: $file: $error\n";
    print {$outhandle} $message;
    print {$failhandle} $message;
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
489
490
4911;
Note: See TracBrowser for help on using the repository browser.