Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

CSVPlugin.pm

Last change on this file was 38797, checked in by kjdon, 8 weeks ago
added option '-ignore_field' - if there is a column with this name in the spreadsheet, and a line is non empty for this column, then ignore the line.
File size: 17.0 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29	use extrametautil;
30
31	use ReadTextFile;
32	use SplitTextFile; # for a couple routines, but we not inheriting
33	use MetadataRead;
34	use CSVFieldSeparator;
35
36	use strict;
37	no strict 'refs'; # allow filehandles to be variables and viceversa
38
39	use Text::CSV;
40
# Compile-time setup: declare the parent classes of CSVPlugin and put a
# UTF-8 output layer on STDERR.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
    binmode(STDERR, ":utf8");
}
46
47
# Command-line arguments understood by this plugin.  Each entry gives the
# option name, the key of its description string, its type, and whether it
# is required; 'deft' is the default value where one exists.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'filename_field',
        'desc' => '{CSVPlugin.filename_field}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => 'Filename',
    },
    {
        'name' => 'no_document_if_source_unspecified',
        'desc' => '{CSVPlugin.no_document_if_source_unspecified}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'no_document_if_source_missing',
        'desc' => '{CSVPlugin.no_document_if_source_missing}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_namespace_for_field_names',
        'desc' => '{CSVPlugin.use_namespace_for_field_names}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'store_field_values_as_document_text',
        'desc' => '{CSVPlugin.store_field_values_as_document_text}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'ignore_field',
        'desc' => '{CSVPlugin.ignore_field}',
        'type' => 'string',
        'reqd' => 'no',
    },
];

# Plugin description record; it carries the argument table above under 'args'.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
91
92
# Default process_exp for this plugin: a case-insensitive match on file
# names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
97
# Constructor.
#
# Registers this class in $pluginlist and appends this plugin's argument and
# option tables to $hashArgOptLists, then invokes the CSVFieldSeparator
# constructor (result discarded) and builds the object itself through the
# ReadTextFile constructor.  The resulting hash is re-blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'} = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    # A string option that was supplied but holds only whitespace is treated
    # the same as an option that was not supplied at all.
    foreach my $string_opt ('use_namespace_for_field_names', 'ignore_field') {
        my $supplied = $self->{$string_opt};
        if (defined $supplied && $supplied =~ m/^\s*$/) {
            $self->{$string_opt} = undef;
        }
    }

    return bless $self, $class;
}
122
123
# Record the CSV file in $block_hash->{'metadata_files'} so that it is
# marked as a metadata file.  Always returns undef, both when the file is
# not one this plugin can process and after recording it.
sub file_block_read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    my $native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
    }

    # kjdon - the upgrade method converts everything to a lower case drive
    # letter, so is the following still needed?  An earlier version stored
    # the path twice, once with a lower-case and once with an upper-case
    # drive letter:
    #   my $lower_drive = $filename_full_path;
    #   $lower_drive =~ s/^([A-Z]):/\l$1:/i;
    #   my $upper_drive = $filename_full_path;
    #   $upper_drive =~ s/^([A-Z]):/\u$1:/i;
    #   $block_hash->{'metadata_files'}->{$lower_drive} = 1;
    #   $block_hash->{'metadata_files'}->{$upper_drive} = 1;
    $block_hash->{'metadata_files'}->{$filename_full_path} = 1;

    return undef; #1
}
154
# Metadata pass over a CSV file.
#
# The first row supplies the field names (spaces and surrounding double
# quotes are removed, and the use_namespace_for_field_names prefix is added
# when set).  Every later row becomes a hash of field name => array ref of
# values, with '[' and ']' in values replaced by numeric entities.
#
# For each row:
#   * if the ignore_field column is non-empty, the row is skipped;
#   * if the filename field names a file that exists relative to the
#     directory of the CSV file, the row's metadata is handed to
#     store_meta_in_extrametadata();
#   * otherwise the row is kept in $self->{'metadata_store'} (keyed by the
#     running row count) for read() to pick up, unless suppressed by
#     no_document_if_source_missing / no_document_if_source_unspecified.
#
# Returns undef when this plugin does not handle the file, -1 when the file
# cannot be opened or its header row cannot be parsed, and 1 otherwise.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
    # the current directory
    my $current_dir = &util::filename_head($filename_full_path);

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity = $self->{'verbosity'};

    # don't add to block list, as we may do some processing in read.

    # read() strips leading slashes/backslashes from $file before it looks
    # up the per-file stores, so the stores are keyed the same way here.
    my $store_key = $file;
    $store_key =~ s/^[\/\\]+//;

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($self->{'verbosity'} > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
    $self->{'textcat_store'}->{$store_key} = $le_rec;

    my $metadata_store = {};
    $self->{'metadata_store'}->{$store_key} = $metadata_store; # used to record metadata for segments with no src doc

    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file for reading: $!");
        return -1;
    }
    my $separate_char = $self->{'csv_field_separator'};

    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if ($md_val_sep eq "");

    # Raw text of the header line; only captured when the separator has to
    # be auto-detected.
    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
        seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
    }

    my $md_sep_fields = $self->{'metadata_separate_fields'};
    undef $md_sep_fields if ($md_sep_fields eq "");

    # When set, only the listed fields have their values split on $md_val_sep.
    my $md_sep_fields_lookup = undef;
    if (defined $md_sep_fields) {
        $md_sep_fields_lookup = {};

        # Comma-separated list; whitespace around the commas is optional.
        my @md_fields = split(/\s*,\s*/, $md_sep_fields);

        for my $md_field (@md_fields) {
            $md_sep_fields_lookup->{$md_field} = 1;
        }
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline ($CSV_FILE);
    if (!defined $first_row) {
        # The raw header line is only available in auto-separator mode, so
        # also report what the CSV parser itself has to say.
        my $header_text = defined $csv_file_field_line ? $csv_file_field_line : "";
        chomp($header_text);
        my $parse_diag = "" . $csv->error_diag();
        my $error_text = "Error: Badly formatted CSV header line: $header_text";
        $error_text .= " [$parse_diag]" if ($parse_diag ne "");
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, $error_text);
        close ($CSV_FILE);
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    my $ignore_field = $self->{'ignore_field'};
    my $ignore_col;
    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
        # Remove any spaces from the field names, and surrounding quotes too
        $csv_file_fields[$i] =~ s/ //g;
        $csv_file_fields[$i] =~ s/^"//;
        $csv_file_fields[$i] =~ s/"$//;

        if ($self->{'use_namespace_for_field_names'}) {
            $csv_file_fields[$i] = $self->{'use_namespace_for_field_names'}. "." . $csv_file_fields[$i];
        }
        if ($csv_file_fields[$i] eq $filename_field) {
            $found_filename_field = 1;
        }
        if ($ignore_field && $csv_file_fields[$i] eq $ignore_field) {
            $ignore_col = $i;
        }
    }

    if (!$found_filename_field) {
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    my $count = 0;
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;

        if (defined $ignore_col && defined $md_vals[$ignore_col] && $md_vals[$ignore_col] ne "") {
            # ignore this line
            print STDERR "ignoring line ".join(",", @md_vals)."\n";
            next;
        }
        $count++;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        my $md_vals_len = scalar(@md_vals);

        for (my $i=0; $i<$md_vals_len; $i++) {
            my $md_val = $md_vals[$i];
            # Only bother with non-empty values that fall under a header field
            next unless (defined $md_val && $md_val ne "" && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];
            $csv_line_metadata{$md_name} = [];

            my $needs_md_val_sep = 0;
            if (defined $md_val_sep) {
                # Default coming in is 'no' (0)
                # => Check to see if any conditions met to turn this into a 'yes' (1)

                # check to see if md_sep_fields is in play, and if it is
                # => determine if this $md_name is one of the ones in $md_sep_fields_lookup
                if (defined $md_sep_fields_lookup) {
                    if ($md_sep_fields_lookup->{$md_name}) {
                        $needs_md_val_sep = 1;
                    }
                }
                else {
                    # if not set, then we apply the md_val_sep to all metadata fields
                    $needs_md_val_sep = 1;
                }
            }

            # $md_val_sep is used as a regular expression here.
            my @field_values = $needs_md_val_sep ? split(/${md_val_sep}/, $md_val) : ($md_val);

            # protect square brackets in metadata values by hex entity encoding them
            # As unescaped square bracket chars in metadata
            # have special meaning in GS' Java runtime code
            for my $meta_value (@field_values) {
                $meta_value =~ s/\[/&\#091;/g;
                $meta_value =~ s/\]/&\#093;/g;
            }
            push (@{$csv_line_metadata{$md_name}}, @field_values);
        }

        # A "Section" column whose first value contains a digit or '.' makes
        # every metadata name on this line carry a "///Section/<value>" suffix.
        # NOTE(review): the pattern is unanchored, so any value containing a
        # digit or '.' qualifies - confirm whether /^[\d.]+$/ was intended.
        # NOTE(review): the renaming also applies to the filename field, so
        # the lookup of $filename_field below finds nothing for such lines.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m){
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            } else{
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $missing_srcdoc = 0;
        my $csv_line_filename="";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?
            my $csv_line_filename_array = $csv_line_metadata{$filename_field};
            if (!defined $csv_line_filename_array) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            } else {
                $csv_line_filename = shift(@$csv_line_filename_array);
                # TODO - have an option for whether we do this or not
                if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($current_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;

                    delete $csv_line_metadata{$filename_field};
                } else {
                    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
                    $missing_srcdoc = 1; # there was one mentioned but its not found
                }
            }
        }

        if ($has_srcdoc) {
            print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        } else {
            my $store_for_dummy = 1;
            if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
                $store_for_dummy = 0;
            } elsif(!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
                $store_for_dummy = 0;
            }
            if ($store_for_dummy) {
                print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
                $metadata_store->{$count} = \%csv_line_metadata;
            }
        }
    } # while csv_line = csv->getline

    close ($CSV_FILE);

    # The file has been handled by this plugin.  (Previously the sub fell
    # off the end and returned whatever close() returned.)
    return 1;
}
393
# Adapted from read in SplitTextFile.
#
# Creates one document object per entry that the metadata_read pass left in
# $self->{'metadata_store'} for this file, and passes each one to
# $processor.  Returns undef if the file is not one this plugin processes,
# 0 if nothing was recorded for it in $self->{'textcat_store'}, and
# otherwise the number of document objects produced.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my $le_rec = $self->{'textcat_store'}->{$file};
    unless (defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    if ($gli) {
        print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "$self->{'plugin_type'} processing $file\n";
    }

    my ($language, $encoding) = ($le_rec->{'language'}, $le_rec->{'encoding'});
    $self->{'textcat_store'}->{$file} = undef;

    # a hash of seg num to metadata hash
    my $metadata_store = $self->{'metadata_store'}->{$file};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);

    my $docs_produced = 0; # num doc objs produced
    my $base_oid;          # computed from the first document, then reused

    # Process each segment in turn, in numeric order of segment number
    my @segment_numbers = sort { $a <=> $b } keys(%$metadata_store);
    foreach my $segment (@segment_numbers) {
        if ($self->{'verbosity'} > 1) {
            print $outhandle "processing segment $segment as its own document\n";
        }
        $docs_produced++;

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $top_section, $metadata);

        # add our stored metadata from metadata_read pass
        my $segment_metadata = $metadata_store->{$segment};
        $self->extra_metadata($doc_obj, $top_section, $segment_metadata);

        if ($self->{'store_field_values_as_document_text'}) {
            # Document text is every field's values, each followed by ", "
            my $new_text = join("",
                                map { join(", ", @{ $segment_metadata->{$_} }) . ", " }
                                keys %$segment_metadata);
            $doc_obj->add_utf8_text($top_section, $new_text);
        }
        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID.
        unless (defined $base_oid) {
            $base_oid = &SplitTextFile::get_base_OID($self,$doc_obj);
        }

        # add an OID
        &SplitTextFile::add_segment_OID($self, $doc_obj, $base_oid, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
        last if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs);
    }

    delete $self->{'metadata_store'}->{$file};

    # Return number of document objects produced
    return $docs_produced;
}
493
# Write a warning about $file to both $outhandle and $failhandle, and, when
# $gli is true, also emit a <ProcessingError> element on STDERR.
sub print_warning {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $warning_line = "CSVPlugin Warning: $file: $error\n";
    print $outhandle $warning_line;
    print $failhandle $warning_line;

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
# Write an error about $file to both $outhandle and $failhandle, and, when
# $gli is true, also emit a <ProcessingError> element on STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $error_line = "CSVPlugin Error: $file: $error\n";
    print $outhandle $error_line;
    print $failhandle $error_line;

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
513
514
515	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

Download in other formats: