Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/perllib/plugins/CSVPlugin.pm@ 32810

Last change on this file since 32810 was 32810, checked in by davidb, 5 years ago
Improved CSV handling. Holding of committing in main perllib/plugins area until next release of Greenstone3
File size: 6.9 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29	use SplitTextFile;
30	use MetadataRead;
31	use strict;
32	no strict 'refs'; # allow filehandles to be variables and viceversa
33
34	use Text::CSV;
35
36	# CSVPlugin is a sub-class of SplitTextFile.
37	sub BEGIN {
38	@CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile');
39	}
40
41
42	my $arguments =
43	[
44	{ 'name' => "separate_char",
45	'desc' => "{CSVPlugin.separate_char}",
46	'type' => "string",
47	'deft' => "auto",
48	'reqd' => "no" },
49	{ 'name' => "process_exp",
50	'desc' => "{BaseImporter.process_exp}",
51	'type' => "regexp",
52	'reqd' => "no",
53	'deft' => &get_default_process_exp() },
54	{ 'name' => "split_exp",
55	'desc' => "{SplitTextFile.split_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_split_exp(),
59	'hiddengli' => "yes" }
60	];
61
62
63	my $options = { 'name' => "CSVPlugin",
64	'desc' => "{CSVPlugin.desc}",
65	'abstract' => "no",
66	'inherits' => "yes",
67	'explodes' => "yes",
68	'args' => $arguments };
69
70
71	# This plugin processes files with the suffix ".csv"
72	sub get_default_process_exp {
73	return q^(?i)(\.csv)$^;
74	}
75
76
77	# This plugin splits the input text by line
78	sub get_default_split_exp {
79	return q^\r?\n^;
80	}
81
82
83	sub new
84	{
85	my ($class) = shift (@_);
86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87	push(@$pluginlist, $class);
88
89	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
90	push(@{$hashArgOptLists->{"OptList"}}, $options);
91
92	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
93
94	return bless $self, $class;
95	}
96
97
98	sub read_file
99	{
100	my $self = shift (@_);
101	my ($filename, $encoding, $language, $textref) = @_;
102
103	# Read in file the usual ReadTextFile way
104	# This ensure that $textref is a unicode aware string
105	$self->SUPER::read_file(@_);
106
107	#
108	# Now top-up the processing of the text with what this plugin
109	# needs
110	#
111
112	# Remove any blank lines so the data is split and processed properly
113	$$textref =~ s/\n(\s*)\n/\n/g;
114
115	# The first line contains the metadata element names
116	$$textref =~ s/^(.*?)\r?\n//;
117	my @csv_file_fields = ();
118	my $csv_file_field_line = $1;
119
120	my $separate_char = $self->{'separate_char'};
121	if ($separate_char =~ m/^auto$/i) {
122	$separate_char = CSVSeparateChar::resolve_auto($csv_file_field_line,$self->{'plugin_type'},$self->{'outhandle'},$self->{'verbosity'});
123	# Replace the 'auto' setting the resolved value (for use later on)
124	$self->{'separate_char'} = $separate_char;
125	}
126
127	my $csv = Text::CSV->new();
128	$csv->sep_char($separate_char);
129
130	if ($csv->parse($csv_file_field_line)) {
131	@csv_file_fields = $csv->fields;
132	}
133	else {
134	print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
135	}
136
137	# $csv_file_field_line .= $separate_char; # To make the regular expressions simpler # ****
138
139	# while ($csv_file_field_line ne "") {
140	# # Handle quoted values
141	# if ($csv_file_field_line =~ s/^\"(.?)\"${separate_char}//) { # ***
142	# my $csv_file_field = $1;
143	# $csv_file_field =~ s/ //g; # Remove any spaces from the field names
144	# push(@csv_file_fields, $csv_file_field);
145	# }
146	# # Normal comma-separated case
147	# elsif ($csv_file_field_line =~ s/^(.?)${separate_char}//) { # ***
148	# my $csv_file_field = $1;
149	# $csv_file_field =~ s/ //g; # Remove any spaces from the field names
150	# push(@csv_file_fields, $csv_file_field);
151	# }
152	# # The line must be formatted incorrectly
153	# else {
154	# print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
155	# last;
156	# }
157	# }
158
159	$self->{'csv_file_fields'} = \@csv_file_fields;
160
161	print STDERR "****** csv file fields joined = ", join(" \|\|\| ", @{$self->{'csv_file_fields'}}), "\n";
162	}
163
164
165	sub process
166	{
167	my $self = shift (@_);
168	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
169	my $outhandle = $self->{'outhandle'};
170
171	my $section = $doc_obj->get_top_section();
172	my $csv_line = $$textref;
173	my @csv_file_fields = @{$self->{'csv_file_fields'}};
174
175	# Add the raw line as the document text
176	$doc_obj->add_utf8_text($section, $csv_line);
177
178	my $separate_char = $self->{'separate_char'};
179
180	my $csv = Text::CSV->new();
181	$csv->sep_char($separate_char);
182
183	# Build a hash of metadata name to metadata value for this line
184	if ($csv->parse($csv_line)) {
185	my @md_vals = $csv->fields;
186	my $md_vals_len = scalar(@md_vals);
187
188	for (my $i=0; $i<$md_vals_len; $i++) {
189	my $md_val = $md_vals[$i];
190	# Only bother with non-empty values
191	if ($md_val ne "" && defined($csv_file_fields[$i])) {
192	$doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val);
193	}
194	}
195	}
196	else {
197	print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
198	}
199
200
201
202	# # Build a hash of metadata name to metadata value for this line
203	# my $i = 0;
204	# $csv_line .= $separate_char; # To make the regular expressions simpler # ****
205	# while ($csv_line ne "") {
206	# # Metadata values containing commas are quoted
207	# if ($csv_line =~ s/^\"(.?)\"${separate_char}//) { # ***
208	# # Only bother with non-empty values
209	# my $match = $1;
210	# if ($match ne "" && defined($csv_file_fields[$i])) {
211	# $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
212	# }
213	# }
214	# # Normal comma-separated case
215	# elsif ($csv_line =~ s/^(.?)${separate_char}//) { # ***
216	# # Only bother with non-empty values
217	# my $match = $1;
218	# if ($match ne "" && defined($csv_file_fields[$i])) {
219	# $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
220	# }
221	# }
222	# # The line must be formatted incorrectly
223	# else {
224	# print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
225	# last;
226	# }
227
228	# $i++;
229	# }
230
231	# Record was processed successfully
232	return 1;
233	}
234
235
236	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: