Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 37047

Last change on this file since 37047 was 37047, checked in by davidb, 16 months ago
Introduction of 'metadata_separate_fields', a plugin option that controls which fields have the value-separator split applied to them. By default, all fields are split when the value-split character is specified; however, there are situations where you want to split on (say) ',' for a Keyword field but not for an Abstract field that happens to contain commas.
File size: 16.4 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29	use extrametautil;
30
31	use ReadTextFile;
32	use SplitTextFile; # for a couple routines, but we not inheriting
33	use MetadataRead;
34	use CSVFieldSeparator;
35
36	use strict;
37	no strict 'refs'; # allow filehandles to be variables and viceversa
38
39	use Text::CSV;
40
# Compile-time setup for the package: switch STDERR to the :utf8 layer and
# declare the three parent classes CSVPlugin inherits from.
BEGIN {
    binmode(STDERR, ":utf8");
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
}
46
47
# Option definitions contributed by CSVPlugin itself.  Each entry records the
# option name, its description (the "{...}" values look like keys into a
# string resource -- TODO confirm), its type, whether it is required and,
# where one exists, its default value.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
    {
        'name' => "filename_field",
        'desc' => "{CSVPlugin.filename_field}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "Filename",
    },
    {
        'name' => "no_document_if_source_unspecified",
        'desc' => "{CSVPlugin.no_document_if_source_unspecified}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "no_document_if_source_missing",
        'desc' => "{CSVPlugin.no_document_if_source_missing}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "use_namespace_for_field_names",
        'desc' => "{CSVPlugin.use_namespace_for_field_names}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "store_field_values_as_document_text",
        'desc' => "{CSVPlugin.store_field_values_as_document_text}",
        'type' => "flag",
        'reqd' => "no",
    },
];
79
80
# Plugin descriptor: names the plugin and attaches the option definitions
# held in $arguments.  The constructor pushes this record onto the shared
# "OptList".
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
87
88
89	# This plugin processes files with the suffix ".csv"
# Returns the default value of the "process_exp" option: a regular-expression
# string that matches, case-insensitively, names ending in ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}
93
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's option definitions are appended
#                      to its "ArgList" entry and its descriptor to "OptList"
#
# Returns the object built by ReadTextFile's constructor, re-blessed into
# $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @$arguments);
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # The CSVFieldSeparator object itself is discarded; presumably the call
    # is made so that its options end up in the shared lists -- TODO confirm.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'}   = {};
    $self->{'metapass_srcdoc'} = {};   # which segments have valid metadata_srcdoc

    # A namespace made up only of whitespace is treated as "not set".
    my $namespace = $self->{'use_namespace_for_field_names'};
    if (defined($namespace) && $namespace =~ m/^\s*$/) {
        $self->{'use_namespace_for_field_names'} = undef;
    }

    return bless($self, $class);
}
115
116
117	# mark the file as a metadata file
# Records a CSV file in $block_hash->{'metadata_files'} so that it is marked
# as a metadata file.
#
# The full path is built from $base_dir and $file.  When
# can_process_this_file() rejects that path nothing is recorded.  The return
# value is always undef.
sub file_block_read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $csv_path = &FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($csv_path)) {
        return undef;
    }

    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($on_native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $csv_path = &util::upgrade_if_dos_filename($csv_path);
    }

    # Historical note (kjdon): the upgrade call is said to lower-case the
    # drive letter, which raised the question of whether both the
    # upper-case-drive and lower-case-drive spellings of the path ought to be
    # registered as well.  Only the single path below is registered.
    $block_hash->{'metadata_files'}->{$csv_path} = 1;

    return undef;
}
147
# Private helper: returns a copy of $value in which literal square brackets
# are replaced by the numeric character entities &#091; and &#093;.
# Unescaped square brackets in metadata have special meaning in GS' Java
# runtime code.
sub _escape_square_brackets {
    my ($value) = @_;
    $value =~ s/\[/&\#091;/g;
    $value =~ s/\]/&\#093;/g;
    return $value;
}

# Metadata pass over a CSV file.
#
# The first row supplies the metadata field names; every following row is
# turned into a hash of field name => array ref of values.  A row whose
# filename field names an existing file (relative to $base_dir) has its
# metadata handed to store_meta_in_extrametadata(); any other row is kept in
# $self->{'metadata_store'}->{$file} (keyed by row number) so that read() can
# later build a document for it, unless the no_document_if_source_missing /
# no_document_if_source_unspecified options say otherwise.
#
# The language/encoding guessed for the file is saved in
# $self->{'textcat_store'}->{$file} for read().
#
# Returns:
#   undef  when can_process_this_file_for_metadata() rejects the file
#   -1     when the file cannot be opened or its header row cannot be parsed
#   1      otherwise
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity  = $self->{'verbosity'};

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($verbosity > 1);

    # don't add to block list, as we may do some processing in read.

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($verbosity > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    $self->{'textcat_store'}->{$file} = { 'language' => $language, 'encoding' => $content_encoding };

    # used to record metadata for segments with no src doc
    my $metadata_store = {};
    $self->{'metadata_store'}->{$file} = $metadata_store;

    # Report an unreadable file for what it is, rather than letting the
    # failed open surface later as a "badly formatted header".
    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file: $!");
        return -1;
    }

    my $separate_char = $self->{'csv_field_separator'};

    # An empty value separator means "do not split values".
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line, $self->{'plugin_type'});
        seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
    }

    # Optional list of the only fields the value separator applies to.
    # Names are compared with the header names as normalised further down
    # (spaces removed, namespace prefix added), so spaces are removed here
    # too.  Whitespace around the commas is optional.
    my $md_sep_fields = $self->{'metadata_separate_fields'};
    my $md_sep_fields_lookup = undef;
    if (defined $md_sep_fields && $md_sep_fields ne "") {
        $md_sep_fields_lookup = {};
        foreach my $listed_field (split(/\s*,\s*/, $md_sep_fields)) {
            my $md_field = $listed_field;
            $md_field =~ s/^\s+//;
            $md_field =~ s/\s+$//;
            $md_field =~ s/ //g;
            next if ($md_field eq "");
            $md_sep_fields_lookup->{$md_field} = 1;
        }
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline ($CSV_FILE);
    if (!defined $first_row) {
        # The raw header text is only available when the separator was auto-detected.
        my $header_text = defined $csv_file_field_line ? $csv_file_field_line : "";
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $header_text");
        close ($CSV_FILE);
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    foreach my $i (0 .. $#csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        $csv_file_fields[$i] =~ s/ //g;
        $csv_file_fields[$i] =~ s/^"//;
        $csv_file_fields[$i] =~ s/"$//;

        if ($self->{'use_namespace_for_field_names'}) {
            $csv_file_fields[$i] = $self->{'use_namespace_for_field_names'} . "." . $csv_file_fields[$i];
        }
        if ($csv_file_fields[$i] eq $filename_field) {
            $found_filename_field = 1;
        }
    }

    if (!$found_filename_field) {
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    my $count = 0;
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;
        $count++;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        foreach my $i (0 .. $#md_vals) {
            my $md_val = $md_vals[$i];
            # Only bother with non-empty values that have a header name
            next unless (defined $md_val && $md_val ne "" && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];
            $csv_line_metadata{$md_name} = [];

            # Splitting is off unless a value separator is set; then it
            # applies to every field, or only to the listed fields when
            # metadata_separate_fields is in play.
            my $needs_md_val_sep = 0;
            if (defined $md_val_sep) {
                if (defined $md_sep_fields_lookup) {
                    $needs_md_val_sep = 1 if ($md_sep_fields_lookup->{$md_name});
                }
                else {
                    $needs_md_val_sep = 1;
                }
            }

            # NOTE(review): the separator is interpolated as a regular
            # expression, so characters such as '|' need escaping in the
            # option value.  Left as is because existing configurations may
            # rely on it.
            my @within_md_vals = $needs_md_val_sep ? split(/${md_val_sep}/, $md_val) : ($md_val);

            push (@{$csv_line_metadata{$md_name}}, map { _escape_square_brackets($_) } @within_md_vals);
        }

        # A "Section" value containing digits/dots re-targets the row's
        # metadata at that section by suffixing every metadata name.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    # The filename field keeps its plain name: it is looked
                    # up by that name just below to find the source document.
                    next if ($found_filename_field && $metaname eq $filename_field);
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $missing_srcdoc = 0;
        my $csv_line_filename = "";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?
            my $csv_line_filename_array = $csv_line_metadata{$filename_field};
            if (!defined $csv_line_filename_array) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            }
            else {
                $csv_line_filename = shift(@$csv_line_filename_array);
                # TODO - have an option for whether we do this or not
                if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;
                    delete $csv_line_metadata{$filename_field};
                }
                else {
                    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
                    $missing_srcdoc = 1; # there was one mentioned but its not found
                }
            }
        }

        if ($has_srcdoc) {
            print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        }
        else {
            my $store_for_dummy = 1;
            if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
                $store_for_dummy = 0;
            }
            elsif (!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
                $store_for_dummy = 0;
            }
            if ($store_for_dummy) {
                print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
                $metadata_store->{$count} = \%csv_line_metadata;
            }
        }
    } # while csv_line = csv->getline

    # getline() returns false both at end of file and on a parse error; make
    # sure a parse error part-way through the file does not go unnoticed.
    if (!$csv->eof()) {
        my $csv_diag = "" . $csv->error_diag();
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "CSV parsing stopped after data line num $count: $csv_diag");
    }

    close ($CSV_FILE);
    return 1;
}
369
370	#adapted from read in splittextfile
# Document pass over a CSV file (adapted from read in SplitTextFile).
#
# Builds one document object per row that metadata_read() left in
# $self->{'metadata_store'} for this file, i.e. the rows that were not
# attached to a source document.  Each document gets Language, Encoding,
# Source, SourceSegment and Plugin metadata, the metadata passed in via
# $metadata, and the row's own metadata; with the
# store_field_values_as_document_text option the row's values also become the
# document text.  Every document is given a segment OID and handed to
# $processor->process().
#
# Returns:
#   undef  when can_process_this_file() rejects the file
#   0      when metadata_read() stored no language/encoding record for it
#   otherwise the number of document objects produced
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # metadata_read() files its per-file records under the name it was given,
    # so try that name first and only then the tidied one.
    my $file_as_given = $file;
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    my $store_key = defined $self->{'textcat_store'}->{$file_as_given} ? $file_as_given : $file;

    # The records are single-use, so take them out of the stores right away.
    my $le_rec = delete $self->{'textcat_store'}->{$store_key};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    # a hash of seg num to metadata hash
    my $metadata_store = delete $self->{'metadata_store'}->{$store_key};
    $metadata_store = {} unless (defined $metadata_store);

    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $self->{'filename_encoding'});

    my $count = 0; # num doc objs produced
    my $id;        # base document ID, calculated from the first document

    # Process each segment in turn
    foreach my $segment (sort { $a <=> $b } keys (%$metadata_store)) {
        print $outhandle "processing segment $segment as its own document\n"
            if $self->{'verbosity'} > 1;
        $count++;

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $cursection = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # add our stored metadata from metadata_read pass
        my $segment_metadata = $metadata_store->{$segment};
        $self->extra_metadata($doc_obj, $cursection, $segment_metadata);

        if ($self->{'store_field_values_as_document_text'}) {
            # Field names are visited in sorted order so that the generated
            # text is the same on every run.
            my $new_text = "";
            foreach my $field_name (sort keys %$segment_metadata) {
                $new_text .= join (", ", @{$segment_metadata->{$field_name}}) . ", ";
            }
            $doc_obj->add_utf8_text($cursection, $new_text);
        }

        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID.
        if (!defined $id) {
            $id = &SplitTextFile::get_base_OID($self, $doc_obj);
        }

        # add an OID
        &SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'}++;
        if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
            last;
        }
    }

    # Return number of document objects produced
    return $count;
}
469
# Writes a "CSVPlugin Warning" line for $file to both $outhandle and
# $failhandle.  When $gli is true a <ProcessingError> element carrying the
# same file name and message is also written to STDERR.
sub print_warning {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $warning_line = "CSVPlugin Warning: $file: $error\n";
    print $outhandle $warning_line;
    print $failhandle $warning_line;

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
# Writes a "CSVPlugin Error" line for $file to both $outhandle and
# $failhandle.  When $gli is true a <ProcessingError> element carrying the
# same file name and message is also written to STDERR.
sub print_error {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $error_line = "CSVPlugin Error: $file: $error\n";
    print $outhandle $error_line;
    print $failhandle $error_line;

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
489
490
491	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: