Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 34249

Last change on this file since 34249 was 34249, checked in by ak19, 4 years ago
Dr Bainbridge, in his commit 32810, had expressed that he intended to commit his MetadataCSVPlugin-related work for dlheritage to the main GS after the then-upcoming GS3 release. His plugin changes support multiple values for a metadata field, and these changes work for me in the GS3tutorials collection, which uses a metadata.csv file. Like dlheritage, I also use the pipe symbol to separate multiple meta values for a meta field/column. Kathy had made a bugfix to MetadataCSVPlugin since Dr Bainbridge branched the code off for dlheritage. I will incorporate her bugfix into Dr Bainbridge's work, test that things still work, and commit that separately next. Committing from a uni machine, as there is something weird about the WMTB VM where I tested these plugin changes and additions: svn committing hasn't been working there for a few days now; it freezes while trying to transmit data.
Property svn:keywords set to `Author Date Id Revision`
File size: 5.3 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29	use SplitTextFile;
30	use MetadataRead;
31	use CSVFieldSeparator;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
36	use Text::CSV;
37
# Establish inheritance at compile time.  Method lookup walks the parents
# in this order: MetadataRead, then SplitTextFile, then CSVFieldSeparator.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile CSVFieldSeparator);
}
42
43
# Plugin-specific argument descriptors.  Each entry is a hash with the
# keys name/desc/type/reqd/deft; the defaults are taken from the
# get_default_*_exp subs of this package.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name      => 'split_exp',
        desc      => '{SplitTextFile.split_exp}',
        type      => 'regexp',
        reqd      => 'no',
        deft      => get_default_split_exp(),
        hiddengli => 'yes',
    },
];
58
59
# Option block describing this plugin; 'args' points at the argument
# descriptors declared in $arguments.
my $options = {
    name     => 'CSVPlugin',
    desc     => '{CSVPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
66
67
# Default process_exp: a case-insensitive pattern matching file names
# that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}
72
73
# Default split_exp: the input text is split into records at each line
# ending (LF, optionally preceded by CR).
sub get_default_split_exp {
    my $line_break_exp = '\r?\n';
    return $line_break_exp;
}
78
79
# Constructor.
#
# Arguments (after the class name): $pluginlist, $inputargs and
# $hashArgOptLists.  The class name is appended to @$pluginlist, and this
# plugin's argument descriptors and option block are appended to the
# "ArgList" and "OptList" entries of $hashArgOptLists before the parent
# constructors run.  Returns the SplitTextFile object re-blessed into $class.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The object returned by CSVFieldSeparator's constructor is discarded;
    # presumably the call matters only for what it registers in the shared
    # lists -- TODO confirm against CSVFieldSeparator.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
94
95
# Read a CSV file and strip off its header row.
#
# Arguments: ($filename, $encoding, $language, $textref).  $textref is a
# reference to the string holding the file text; it is modified in place.
#
# The reading itself is delegated to the parent class.  Afterwards:
#   * blank lines are removed from the text;
#   * the first line is removed from the text and parsed as the list of
#     metadata field names, stored in $self->{'csv_file_fields'}
#     (an empty list if the line cannot be parsed);
#   * the field separator actually used is stored in
#     $self->{'separate_char'}, which is the key process() reads.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in file the usual ReadTextFile way
    # This ensures that $textref is a unicode aware string
    $self->SUPER::read_file(@_);

    #
    # Now top-up the processing of the text with what this plugin
    # needs
    #

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.
    # Only trust $1 when the substitution really matched: otherwise it would
    # still hold the capture of the blank-line regex above.  Accepting
    # end-of-text as a terminator also handles a header-only file that has
    # no trailing newline.
    my $csv_file_field_line = "";
    if ($$textref =~ s/^(.*?)\r?(?:\n|\z)//) {
        $csv_file_field_line = $1;
    }

    my $separate_char = $self->{'csv_field_separator'};
    if (defined $separate_char && $separate_char =~ m/^auto$/i) {
        # 'auto' is resolved from the header line
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }
    # Always record the separator in use (not only in the 'auto' case), so
    # that process() parses the data rows with the same separator as the
    # header row.
    $self->{'separate_char'} = $separate_char;

    # binary => 1 lets Text::CSV accept fields containing characters
    # outside printable ASCII.
    my $csv = Text::CSV->new({ binary => 1 });
    $csv->sep_char($separate_char) if (defined $separate_char);

    my @csv_file_fields = ();
    if ($csv->parse($csv_file_field_line)) {
        @csv_file_fields = $csv->fields;
    }
    else {
        print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    }

    $self->{'csv_file_fields'} = \@csv_file_fields;

    # print STDERR "**** CSV file fields joined = ", join(" \|\|\| ", @{$self->{'csv_file_fields'}}), "\n";
}
139
140
# Turn one CSV data row into the text and metadata of a document.
#
# Arguments: ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj,
# $gli).  Only $textref (reference to the raw CSV line) and $doc_obj are
# used here.
#
# The raw line is added as the text of the document's top section.  The
# line is then parsed, and every non-empty value whose column has a name in
# $self->{'csv_file_fields'} is added as metadata under that name.  When
# $self->{'metadata_value_separator'} is a non-empty string, each value is
# first split on it and every piece is added as a separate metadata value.
#
# Always returns 1; a line that cannot be parsed is reported on STDERR and
# contributes no metadata.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Fall back to an empty field list if no header has been recorded
    my @csv_file_fields = @{ $self->{'csv_file_fields'} || [] };

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # 'separate_char' holds the separator resolved while reading the file.
    # If it was never recorded, use the configured separator instead,
    # unless that is the unresolved 'auto' placeholder.
    my $separate_char = $self->{'separate_char'};
    if (!defined $separate_char) {
        my $configured_sep = $self->{'csv_field_separator'};
        if (defined $configured_sep && $configured_sep !~ m/^auto$/i) {
            $separate_char = $configured_sep;
        }
    }

    # An undefined or empty value separator means "do not split values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (!defined $md_val_sep || $md_val_sep eq "");

    # binary => 1 lets Text::CSV accept fields containing characters
    # outside printable ASCII.
    my $csv = Text::CSV->new({ binary => 1 });
    # With no usable separator, Text::CSV's own default stays in effect
    $csv->sep_char($separate_char) if (defined $separate_char);

    if ($csv->parse($csv_line)) {
        my @md_vals = $csv->fields;

        for my $i (0 .. $#md_vals) {
            my $md_val = $md_vals[$i];
            my $md_name = $csv_file_fields[$i];

            # Only bother with non-empty values in named columns
            next unless (defined $md_val && $md_val ne "" && defined $md_name);

            # Note: the separator is interpolated as a regular expression
            my @within_md_vals = (defined $md_val_sep)
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            foreach my $within_md_val (@within_md_vals) {
                $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
            }
        }
    }
    else {
        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    }

    # The record counts as handled even when the line could not be parsed
    return 1;
}
192
193
194	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: