source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/perllib/plugins/CSVPlugin.pm@ 32810

Last change on this file since 32810 was 32810, checked in by davidb, 5 years ago

Improved CSV handling. Holding of committing in main perllib/plugins area until next release of Greenstone3

File size: 6.9 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29use SplitTextFile;
30use MetadataRead;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use Text::CSV;
35
36# CSVPlugin is a sub-class of SplitTextFile.
37sub BEGIN {
38 @CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile');
39}
40
41
42my $arguments =
43 [
44 { 'name' => "separate_char",
45 'desc' => "{CSVPlugin.separate_char}",
46 'type' => "string",
47 'deft' => "auto",
48 'reqd' => "no" },
49 { 'name' => "process_exp",
50 'desc' => "{BaseImporter.process_exp}",
51 'type' => "regexp",
52 'reqd' => "no",
53 'deft' => &get_default_process_exp() },
54 { 'name' => "split_exp",
55 'desc' => "{SplitTextFile.split_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_split_exp(),
59 'hiddengli' => "yes" }
60 ];
61
62
63my $options = { 'name' => "CSVPlugin",
64 'desc' => "{CSVPlugin.desc}",
65 'abstract' => "no",
66 'inherits' => "yes",
67 'explodes' => "yes",
68 'args' => $arguments };
69
70
71# This plugin processes files with the suffix ".csv"
72sub get_default_process_exp {
73 return q^(?i)(\.csv)$^;
74}
75
76
77# This plugin splits the input text by line
78sub get_default_split_exp {
79 return q^\r?\n^;
80}
81
82
83sub new
84{
85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
90 push(@{$hashArgOptLists->{"OptList"}}, $options);
91
92 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
93
94 return bless $self, $class;
95}
96
97
98sub read_file
99{
100 my $self = shift (@_);
101 my ($filename, $encoding, $language, $textref) = @_;
102
103 # Read in file the usual ReadTextFile way
104 # This ensure that $textref is a unicode aware string
105 $self->SUPER::read_file(@_);
106
107 #
108 # Now top-up the processing of the text with what this plugin
109 # needs
110 #
111
112 # Remove any blank lines so the data is split and processed properly
113 $$textref =~ s/\n(\s*)\n/\n/g;
114
115 # The first line contains the metadata element names
116 $$textref =~ s/^(.*?)\r?\n//;
117 my @csv_file_fields = ();
118 my $csv_file_field_line = $1;
119
120 my $separate_char = $self->{'separate_char'};
121 if ($separate_char =~ m/^auto$/i) {
122 $separate_char = CSVSeparateChar::resolve_auto($csv_file_field_line,$self->{'plugin_type'},$self->{'outhandle'},$self->{'verbosity'});
123 # Replace the 'auto' setting the resolved value (for use later on)
124 $self->{'separate_char'} = $separate_char;
125 }
126
127 my $csv = Text::CSV->new();
128 $csv->sep_char($separate_char);
129
130 if ($csv->parse($csv_file_field_line)) {
131 @csv_file_fields = $csv->fields;
132 }
133 else {
134 print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
135 }
136
137 # $csv_file_field_line .= $separate_char; # To make the regular expressions simpler # ****
138
139 # while ($csv_file_field_line ne "") {
140 # # Handle quoted values
141 # if ($csv_file_field_line =~ s/^\"(.*?)\"${separate_char}//) { # ****
142 # my $csv_file_field = $1;
143 # $csv_file_field =~ s/ //g; # Remove any spaces from the field names
144 # push(@csv_file_fields, $csv_file_field);
145 # }
146 # # Normal comma-separated case
147 # elsif ($csv_file_field_line =~ s/^(.*?)${separate_char}//) { # ****
148 # my $csv_file_field = $1;
149 # $csv_file_field =~ s/ //g; # Remove any spaces from the field names
150 # push(@csv_file_fields, $csv_file_field);
151 # }
152 # # The line must be formatted incorrectly
153 # else {
154 # print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
155 # last;
156 # }
157 # }
158
159 $self->{'csv_file_fields'} = \@csv_file_fields;
160
161 print STDERR "****** csv file fields joined = ", join(" ||| ", @{$self->{'csv_file_fields'}}), "\n";
162}
163
164
165sub process
166{
167 my $self = shift (@_);
168 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
169 my $outhandle = $self->{'outhandle'};
170
171 my $section = $doc_obj->get_top_section();
172 my $csv_line = $$textref;
173 my @csv_file_fields = @{$self->{'csv_file_fields'}};
174
175 # Add the raw line as the document text
176 $doc_obj->add_utf8_text($section, $csv_line);
177
178 my $separate_char = $self->{'separate_char'};
179
180 my $csv = Text::CSV->new();
181 $csv->sep_char($separate_char);
182
183 # Build a hash of metadata name to metadata value for this line
184 if ($csv->parse($csv_line)) {
185 my @md_vals = $csv->fields;
186 my $md_vals_len = scalar(@md_vals);
187
188 for (my $i=0; $i<$md_vals_len; $i++) {
189 my $md_val = $md_vals[$i];
190 # Only bother with non-empty values
191 if ($md_val ne "" && defined($csv_file_fields[$i])) {
192 $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val);
193 }
194 }
195 }
196 else {
197 print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
198 }
199
200
201
202 # # Build a hash of metadata name to metadata value for this line
203 # my $i = 0;
204 # $csv_line .= $separate_char; # To make the regular expressions simpler # ****
205 # while ($csv_line ne "") {
206 # # Metadata values containing commas are quoted
207 # if ($csv_line =~ s/^\"(.*?)\"${separate_char}//) { # ****
208 # # Only bother with non-empty values
209 # my $match = $1;
210 # if ($match ne "" && defined($csv_file_fields[$i])) {
211 # $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
212 # }
213 # }
214 # # Normal comma-separated case
215 # elsif ($csv_line =~ s/^(.*?)${separate_char}//) { # ****
216 # # Only bother with non-empty values
217 # my $match = $1;
218 # if ($match ne "" && defined($csv_file_fields[$i])) {
219 # $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
220 # }
221 # }
222 # # The line must be formatted incorrectly
223 # else {
224 # print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
225 # last;
226 # }
227
228 # $i++;
229 # }
230
231 # Record was processed successfully
232 return 1;
233}
234
235
2361;
Note: See TracBrowser for help on using the repository browser.