source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

Last change on this file was 38797, checked in by kjdon, 8 weeks ago

Added option '-ignore_field': if the spreadsheet has a column with this name and a row has a non-empty value in that column, that row is ignored.

File size: 17.0 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29use extrametautil;
30
31use ReadTextFile;
32use SplitTextFile; # for a couple routines, but we not inheriting
33use MetadataRead;
34use CSVFieldSeparator;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use Text::CSV;
40
# Compile-time setup: declare the parent classes of CSVPlugin and switch
# STDERR to UTF-8 output so diagnostics containing non-ASCII text print cleanly.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
    binmode(STDERR, ":utf8");
}
46
47
# Plugin-specific command line arguments. Each entry names the option, the
# resource key of its description, its type, and whether it is required;
# 'deft' supplies a default value where one exists.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name => 'filename_field',
        desc => '{CSVPlugin.filename_field}',
        type => 'string',
        reqd => 'no',
        deft => 'Filename',
    },
    {
        name => 'no_document_if_source_unspecified',
        desc => '{CSVPlugin.no_document_if_source_unspecified}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'no_document_if_source_missing',
        desc => '{CSVPlugin.no_document_if_source_missing}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'use_namespace_for_field_names',
        desc => '{CSVPlugin.use_namespace_for_field_names}',
        type => 'string',
        reqd => 'no',
    },
    {
        name => 'store_field_values_as_document_text',
        desc => '{CSVPlugin.store_field_values_as_document_text}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'ignore_field',
        desc => '{CSVPlugin.ignore_field}',
        type => 'string',
        reqd => 'no',
    },
];
83
84
# Plugin descriptor: name, description resource key, capability flags, and
# the argument list defined above.
my $options = {
    name     => 'CSVPlugin',
    desc     => '{CSVPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
91
92
# Default value for the process_exp option: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
97
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" arrays
#
# Returns the object built by ReadTextFile->new, re-blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # The CSVFieldSeparator object itself is discarded; the call is made
    # only for its effect on the lists passed in.
    # NOTE(review): presumably it registers its own arguments - confirm.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'}   = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    # An option given as empty or whitespace-only counts as "not set".
    foreach my $option_name ('use_namespace_for_field_names', 'ignore_field') {
        my $option_value = $self->{$option_name};
        if (defined $option_value && $option_value =~ m/^\s*$/) {
            $self->{$option_name} = undef;
        }
    }

    return bless $self, $class;
}
122
123
# Record the CSV file as a metadata file in the block hash.
#
# Returns undef in every case: either the file is not one this plugin can
# process, or it has been recorded under $block_hash->{'metadata_files'}.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $filename_full_path = FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($on_native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $filename_full_path = util::upgrade_if_dos_filename($filename_full_path);
    }

    # kjdon - the upgrade method converts everything to a lower case drive
    # letter, so the earlier approach of storing the path under both a
    # lower-case and an upper-case drive letter is (apparently) no longer
    # needed; only the single upgraded path is recorded.
    $block_hash->{'metadata_files'}->{$filename_full_path} = 1;

    return undef; #1
}
154
# Read a CSV file during the metadata pass.
#
# The first row supplies the metadata field names. Every following row
# becomes a hash of field name => array ref of values, which is then either
#   * handed to store_meta_in_extrametadata() when the row names an existing
#     source document in the filename field, or
#   * kept in $self->{'metadata_store'}->{$file}->{<row number>} so that
#     read() can later build a metadata-only ("dummy") document from it.
#
# Arguments: $base_dir/$file locate the CSV file; $extrametakeys,
# $extrametadata and $extrametafile are passed through to
# store_meta_in_extrametadata(); $gli enables GLI-style progress/error tags
# on STDERR. The remaining arguments are not used here.
#
# Returns undef if this plugin does not handle the file for metadata, -1 if
# the file cannot be opened or its header row cannot be parsed, 1 otherwise.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
    # the current directory
    my $current_dir = &util::filename_head($filename_full_path);

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity = $self->{'verbosity'};

    # don't add to block list, as we may do some processing in read.

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($verbosity > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
    $self->{'textcat_store'}->{$file} = $le_rec;

    my $metadata_store = {};
    $self->{'metadata_store'}->{$file} = $metadata_store; # used to record metadata for segments with no src doc

    # Fail loudly if the file cannot be opened, rather than letting the CSV
    # parser trip over an unopened handle further down.
    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file for reading: $!");
        return -1;
    }

    my $separate_char = $self->{'csv_field_separator'};

    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (!defined $md_val_sep || $md_val_sep eq "");

    # Only set when the separator has to be auto-detected from the header.
    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line, $self->{'plugin_type'});
        seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
    }

    my $md_sep_fields = $self->{'metadata_separate_fields'};
    undef $md_sep_fields if (!defined $md_sep_fields || $md_sep_fields eq "");

    # When given, only the listed fields have their values split on $md_val_sep.
    my $md_sep_fields_lookup = undef;
    if (defined $md_sep_fields) {
        $md_sep_fields_lookup = {};
        for my $md_field (split(/\s*,\s*/, $md_sep_fields)) {
            $md_sep_fields_lookup->{$md_field} = 1;
        }
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline($CSV_FILE);
    if (!defined $first_row) {
        # The raw header text is only available in 'auto' separator mode.
        my $header_text = defined $csv_file_field_line ? $csv_file_field_line : "";
        close($CSV_FILE);
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $header_text");
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    my $ignore_field = $self->{'ignore_field'};
    my $ignore_col;
    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
        # Remove any spaces from the field names, and surrounding quotes too
        $csv_file_fields[$i] =~ s/ //g;
        $csv_file_fields[$i] =~ s/^"//;
        $csv_file_fields[$i] =~ s/"$//;

        if ($self->{'use_namespace_for_field_names'}) {
            $csv_file_fields[$i] = $self->{'use_namespace_for_field_names'} . "." . $csv_file_fields[$i];
        }
        # Note: both comparisons below are made against the (possibly
        # namespace-prefixed) field name.
        if ($csv_file_fields[$i] eq $filename_field) {
            $found_filename_field = 1;
        }
        if ($ignore_field && $csv_file_fields[$i] eq $ignore_field) {
            $ignore_col = $i;
        }
    }

    if (!$found_filename_field) {
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    # Protect square brackets in metadata values by replacing them with
    # numeric character references, as unescaped square bracket chars in
    # metadata have special meaning in GS' Java runtime code.
    my $escape_brackets = sub {
        my ($value) = @_;
        $value =~ s/\[/&\#091;/g;
        $value =~ s/\]/&\#093;/g;
        return $value;
    };

    my $count = 0; # number of rows accepted so far (ignored rows are not counted)
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;

        if (defined $ignore_col && defined $md_vals[$ignore_col] && $md_vals[$ignore_col] ne "") {
            # ignore this line
            print STDERR "ignoring line ".join(",", @md_vals)."\n";
            next;
        }
        $count++;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        my $md_vals_len = scalar(@md_vals);
        for (my $i = 0; $i < $md_vals_len; $i++) {
            my $md_val = $md_vals[$i];
            # Only bother with non-empty values that have a column name
            next unless (defined $md_val && $md_val ne "" && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];
            $csv_line_metadata{$md_name} = [];

            # Split the value when a separator is configured and either no
            # field restriction is set or this field is one of the listed ones.
            my $needs_md_val_sep = 0;
            if (defined $md_val_sep) {
                if (defined $md_sep_fields_lookup) {
                    $needs_md_val_sep = 1 if ($md_sep_fields_lookup->{$md_name});
                }
                else {
                    $needs_md_val_sep = 1;
                }
            }

            if ($needs_md_val_sep) {
                # The separator is used as a regular expression.
                my @within_md_vals = split(/${md_val_sep}/, $md_val);
                push (@{$csv_line_metadata{$md_name}}, map { $escape_brackets->($_) } @within_md_vals);
            }
            else {
                push (@{$csv_line_metadata{$md_name}}, $escape_brackets->($md_val));
            }
        }

        # A "Section" column whose first value contains digits/dots turns
        # this row into section-level metadata: every field name gets a
        # "///Section/<value>" suffix.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if (defined $section_value && $section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    # Leave the filename field under its own name, otherwise
                    # the source document lookup below can never find it.
                    next if ($metaname eq $filename_field);
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            elsif (defined $section_value) {
                # Not a section number after all: put the value back.
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $missing_srcdoc = 0;
        my $csv_line_filename = "";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?
            my $csv_line_filename_array = $csv_line_metadata{$filename_field};
            my $named_file = (defined $csv_line_filename_array) ? shift(@$csv_line_filename_array) : undef;
            if (!defined $named_file || $named_file eq "") {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            }
            else {
                $csv_line_filename = $named_file;
                # TODO - have an option for whether we do this or not
                if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($current_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;
                    delete $csv_line_metadata{$filename_field};
                }
                else {
                    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
                    $missing_srcdoc = 1; # there was one mentioned but its not found
                }
            }
        }

        if ($has_srcdoc) {
            print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        }
        else {
            # No usable source document: keep the row for a dummy document
            # unless the relevant no_document_* flag says otherwise.
            my $store_for_dummy = 1;
            if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
                $store_for_dummy = 0;
            }
            elsif (!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
                $store_for_dummy = 0;
            }
            if ($store_for_dummy) {
                print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
                $metadata_store->{$count} = \%csv_line_metadata;
            }
        }
    } # while csv_line = csv->getline

    # getline() returns undef both at end of file and on a parse error;
    # report the latter so that dropped rows do not go unnoticed.
    if (!$csv->eof()) {
        my $diag = "" . $csv->error_diag();
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "Stopped reading CSV file after $count rows: $diag");
    }

    close ($CSV_FILE);
    return 1;
}
393
# Adapted from read in SplitTextFile.
#
# Turn every row that metadata_read() stored for this file (rows with no
# usable source document) into its own metadata-only document and pass it to
# $processor.
#
# Returns undef if this plugin cannot process the file, 0 if metadata_read()
# left no language/encoding record for it, otherwise the number of document
# objects produced. Stops early once $self->{'num_processed'} reaches
# $maxdocs (unless $maxdocs is -1).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $untidied_file = $file;
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # metadata_read() stores its records under the file name exactly as it
    # was given, so fall back to the untidied name if the tidied one has no
    # record.
    my $store_key = $file;
    if (!defined $self->{'textcat_store'}->{$store_key}
        && defined $self->{'textcat_store'}->{$untidied_file}) {
        $store_key = $untidied_file;
    }

    my $le_rec = $self->{'textcat_store'}->{$store_key};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $verbosity > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};
    delete $self->{'textcat_store'}->{$store_key};

    my $metadata_store = $self->{'metadata_store'}->{$store_key}; # a hash of seg num to metadata hash

    my $count = 0; # num doc objs produced

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);

    my $id; # base OID, calculated from the first document and reused for the rest

    # Process each segment (record number) in turn, in numeric order
    foreach my $segment (sort { $a <=> $b } keys (%$metadata_store)) {
        print $outhandle "processing segment $segment as its own document\n"
            if $verbosity > 1;
        $count++;

        # create a new document
        my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $cursection = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # add our stored metadata from metadata_read pass
        my $segment_metadata = $metadata_store->{$segment};
        $self->extra_metadata($doc_obj, $cursection, $segment_metadata);

        if ($self->{'store_field_values_as_document_text'}) {
            # Visit the fields in sorted order so the generated text is the
            # same on every run (plain hash order is not stable).
            my $new_text = "";
            foreach my $field_name (sort keys %$segment_metadata) {
                my $values = $segment_metadata->{$field_name};
                $new_text .= join (", ", @$values).", ";
            }
            $doc_obj->add_utf8_text($cursection, $new_text);
        }
        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID.
        if (!defined $id) {
            $id = &SplitTextFile::get_base_OID($self,$doc_obj);
        }

        # add an OID
        &SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
        if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
            last;
        }
    }

    delete $self->{'metadata_store'}->{$store_key};

    # Return number of document objects produced
    return $count;
}
493
# Report a warning about $file: the same line goes to both the output handle
# and the fail handle, and when $gli is true a <ProcessingError> tag is also
# written to STDERR.
sub print_warning {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $warning_line = "CSVPlugin Warning: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print $handle $warning_line;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
# Report an error about $file: the same line goes to both the output handle
# and the fail handle, and when $gli is true a <ProcessingError> tag is also
# written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $error_line = "CSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print $handle $error_line;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
513
514
5151;
Note: See TracBrowser for help on using the repository browser.