Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/perllib/plugins/CSVPlugin.pm@ 32810

Last change on this file since 32810 was 32810, checked in by davidb, 5 years ago
Improved CSV handling. Holding off on committing this to the main perllib/plugins area until the next release of Greenstone3.
File size: 6.9 KB

Rev	Line
[32810]	1	###########################################################################
	2	#
	3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2006 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	package CSVPlugin;
	28
	29	use SplitTextFile;
	30	use MetadataRead;
	31	use strict;
	32	no strict 'refs'; # allow filehandles to be variables and viceversa
	33
	34	use Text::CSV;
	35
	36	# CSVPlugin is a sub-class of SplitTextFile.
	37	sub BEGIN {
	38	@CSVPlugin::ISA = ('MetadataRead', 'SplitTextFile');
	39	}
	40
	41
	42	my $arguments =
	43	[
	44	{ 'name' => "separate_char",
	45	'desc' => "{CSVPlugin.separate_char}",
	46	'type' => "string",
	47	'deft' => "auto",
	48	'reqd' => "no" },
	49	{ 'name' => "process_exp",
	50	'desc' => "{BaseImporter.process_exp}",
	51	'type' => "regexp",
	52	'reqd' => "no",
	53	'deft' => &get_default_process_exp() },
	54	{ 'name' => "split_exp",
	55	'desc' => "{SplitTextFile.split_exp}",
	56	'type' => "regexp",
	57	'reqd' => "no",
	58	'deft' => &get_default_split_exp(),
	59	'hiddengli' => "yes" }
	60	];
	61
	62
	63	my $options = { 'name' => "CSVPlugin",
	64	'desc' => "{CSVPlugin.desc}",
	65	'abstract' => "no",
	66	'inherits' => "yes",
	67	'explodes' => "yes",
	68	'args' => $arguments };
	69
	70
	71	# This plugin processes files with the suffix ".csv"
	72	sub get_default_process_exp {
	73	return q^(?i)(\.csv)$^;
	74	}
	75
	76
	77	# This plugin splits the input text by line
	78	sub get_default_split_exp {
	79	return q^\r?\n^;
	80	}
	81
	82
	83	sub new
	84	{
	85	my ($class) = shift (@_);
	86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	87	push(@$pluginlist, $class);
	88
	89	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
	90	push(@{$hashArgOptLists->{"OptList"}}, $options);
	91
	92	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
	93
	94	return bless $self, $class;
	95	}
	96
	97
	98	sub read_file
	99	{
	100	my $self = shift (@_);
	101	my ($filename, $encoding, $language, $textref) = @_;
	102
	103	# Read in file the usual ReadTextFile way
	104	# This ensure that $textref is a unicode aware string
	105	$self->SUPER::read_file(@_);
	106
	107	#
	108	# Now top-up the processing of the text with what this plugin
	109	# needs
	110	#
	111
	112	# Remove any blank lines so the data is split and processed properly
	113	$$textref =~ s/\n(\s*)\n/\n/g;
	114
	115	# The first line contains the metadata element names
	116	$$textref =~ s/^(.*?)\r?\n//;
	117	my @csv_file_fields = ();
	118	my $csv_file_field_line = $1;
	119
	120	my $separate_char = $self->{'separate_char'};
	121	if ($separate_char =~ m/^auto$/i) {
	122	$separate_char = CSVSeparateChar::resolve_auto($csv_file_field_line,$self->{'plugin_type'},$self->{'outhandle'},$self->{'verbosity'});
	123	# Replace the 'auto' setting the resolved value (for use later on)
	124	$self->{'separate_char'} = $separate_char;
	125	}
	126
	127	my $csv = Text::CSV->new();
	128	$csv->sep_char($separate_char);
	129
	130	if ($csv->parse($csv_file_field_line)) {
	131	@csv_file_fields = $csv->fields;
	132	}
	133	else {
	134	print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
	135	}
	136
	137	# $csv_file_field_line .= $separate_char; # To make the regular expressions simpler # ****
	138
	139	# while ($csv_file_field_line ne "") {
	140	# # Handle quoted values
	141	# if ($csv_file_field_line =~ s/^\"(.?)\"${separate_char}//) { # ***
	142	# my $csv_file_field = $1;
	143	# $csv_file_field =~ s/ //g; # Remove any spaces from the field names
	144	# push(@csv_file_fields, $csv_file_field);
	145	# }
	146	# # Normal comma-separated case
	147	# elsif ($csv_file_field_line =~ s/^(.?)${separate_char}//) { # ***
	148	# my $csv_file_field = $1;
	149	# $csv_file_field =~ s/ //g; # Remove any spaces from the field names
	150	# push(@csv_file_fields, $csv_file_field);
	151	# }
	152	# # The line must be formatted incorrectly
	153	# else {
	154	# print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
	155	# last;
	156	# }
	157	# }
	158
	159	$self->{'csv_file_fields'} = \@csv_file_fields;
	160
	161	print STDERR "****** csv file fields joined = ", join(" \|\|\| ", @{$self->{'csv_file_fields'}}), "\n";
	162	}
	163
	164
	165	sub process
	166	{
	167	my $self = shift (@_);
	168	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	169	my $outhandle = $self->{'outhandle'};
	170
	171	my $section = $doc_obj->get_top_section();
	172	my $csv_line = $$textref;
	173	my @csv_file_fields = @{$self->{'csv_file_fields'}};
	174
	175	# Add the raw line as the document text
	176	$doc_obj->add_utf8_text($section, $csv_line);
	177
	178	my $separate_char = $self->{'separate_char'};
	179
	180	my $csv = Text::CSV->new();
	181	$csv->sep_char($separate_char);
	182
	183	# Build a hash of metadata name to metadata value for this line
	184	if ($csv->parse($csv_line)) {
	185	my @md_vals = $csv->fields;
	186	my $md_vals_len = scalar(@md_vals);
	187
	188	for (my $i=0; $i<$md_vals_len; $i++) {
	189	my $md_val = $md_vals[$i];
	190	# Only bother with non-empty values
	191	if ($md_val ne "" && defined($csv_file_fields[$i])) {
	192	$doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $md_val);
	193	}
	194	}
	195	}
	196	else {
	197	print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
	198	}
	199
	200
	201
	202	# # Build a hash of metadata name to metadata value for this line
	203	# my $i = 0;
	204	# $csv_line .= $separate_char; # To make the regular expressions simpler # ****
	205	# while ($csv_line ne "") {
	206	# # Metadata values containing commas are quoted
	207	# if ($csv_line =~ s/^\"(.?)\"${separate_char}//) { # ***
	208	# # Only bother with non-empty values
	209	# my $match = $1;
	210	# if ($match ne "" && defined($csv_file_fields[$i])) {
	211	# $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
	212	# }
	213	# }
	214	# # Normal comma-separated case
	215	# elsif ($csv_line =~ s/^(.?)${separate_char}//) { # ***
	216	# # Only bother with non-empty values
	217	# my $match = $1;
	218	# if ($match ne "" && defined($csv_file_fields[$i])) {
	219	# $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $match);
	220	# }
	221	# }
	222	# # The line must be formatted incorrectly
	223	# else {
	224	# print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
	225	# last;
	226	# }
	227
	228	# $i++;
	229	# }
	230
	231	# Record was processed successfully
	232	return 1;
	233	}
	234
	235
	236	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: