Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: main/trunk/greenstone2/perllib/plugins/CSVDeprecatedPlugin.pm@ 36479

Last change on this file since 36479 was 36479, checked in by kjdon, 20 months ago
renaming the old CSVPlugin and MetadataCSVPlugin to Deprecated versions, prior to adding the new CSVPlugin which handles both cases, and is new and improved.
Property svn:keywords set to `Author Date Id Revision`
File size: 5.6 KB

Rev	Line
[11918]	1	###########################################################################
	2	#
[15872]	3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
[11918]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2006 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package CSVPlugin;
[11918]	28
[15872]	29	use SplitTextFile;
[24794]	30	use MetadataRead;
[34249]	31	use CSVFieldSeparator;
	32
[11918]	33	use strict;
[12610]	34	no strict 'refs'; # allow filehandles to be variables and viceversa
[11918]	35
[34249]	36	use Text::CSV;
[11918]	37
# Set up inheritance at compile time: method lookup tries MetadataRead
# first, then SplitTextFile, then CSVFieldSeparator.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile CSVFieldSeparator);
}
	42
	43
# Argument descriptors contributed by this plugin; new() appends them to
# the shared "ArgList".

# Which files the plugin accepts (defaults to the .csv suffix pattern).
my $process_exp_arg = {
    'name' => 'process_exp',
    'desc' => '{BaseImporter.process_exp}',
    'type' => 'regexp',
    'reqd' => 'no',
    'deft' => get_default_process_exp(),
};

# How the file text is cut into records (defaults to one record per line).
my $split_exp_arg = {
    'name'      => 'split_exp',
    'desc'      => '{SplitTextFile.split_exp}',
    'type'      => 'regexp',
    'reqd'      => 'no',
    'deft'      => get_default_split_exp(),
    'hiddengli' => 'yes',
};

my $arguments = [ $process_exp_arg, $split_exp_arg ];
[11918]	58
	59
# Plugin description record; new() appends it to the shared "OptList".
# The 'args' entry points at the argument descriptors declared above.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
	66
	67
# Default value for the process_exp argument: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
	72
	73
	74	# This plugin splits the input text by line
	75	sub get_default_split_exp {
	76	return q^\r?\n^;
	77	}
	78
	79
# Constructor.
#
# Arguments: ($class, $pluginlist, $inputargs, $hashArgOptLists)
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument descriptors and option record
#
# Returns the object built by SplitTextFile's constructor, re-blessed into
# $class.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # The object returned here is discarded on purpose; the call is kept for
    # whatever it does to the shared lists it is handed (presumably it
    # registers the field-separator arguments -- confirm against
    # CSVFieldSeparator).
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
	94
	95
# Read a CSV file and prepare its text for record-by-record processing.
#
# Arguments: ($filename, $encoding, $language, $textref) -- handed on
# unchanged to the parent class's read_file(), which loads the file text
# into $$textref.
#
# On top of the parent's work this method:
#   * collapses blank lines so that splitting by line only yields records;
#   * removes the first (header) line from $$textref and parses it into the
#     list of column names, stored per file in
#     $self->{'csv_file_fields'}->{$filename};
#   * settles the field separator (resolving 'auto' through resolve_auto())
#     and stores it in $self->{'separate_char'}, where process() reads it.
#
# A header line that cannot be parsed is reported on STDERR and leaves the
# file with an empty column list.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in file the usual ReadTextFile way
    # This ensures that $textref is a unicode aware string
    $self->SUPER::read_file(@_);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  Only trust $1
    # when the substitution really matched; after a failed match $1 still
    # holds the capture of some earlier, unrelated match.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        # No line terminator at all: the whole text is the header line and
        # there are no data records left to process.
        $csv_file_field_line = defined($$textref) ? $$textref : "";
        $csv_file_field_line =~ s/\r$//;
        $$textref = "";
    }

    my $separate_char = $self->{'csv_field_separator'};
    if ($separate_char =~ m/^auto$/i) {
        # Work out the separator from the header line
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }
    # Always record the separator in effect (not just for 'auto'), because
    # process() takes it from here when parsing each data line.
    $self->{'separate_char'} = $separate_char;

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    my @csv_file_fields = ();
    if ($csv->parse($csv_file_field_line)) {
        @csv_file_fields = $csv->fields;
    }
    else {
        print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    }

    # Keyed by file name so several CSV files can be handled by one plugin
    # object without their headers overwriting each other.
    $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields;
}
	140
	141
# Turn one CSV data line into the text and metadata of a document.
#
# Arguments: ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli)
#   $textref  - reference to the raw CSV line for this record
#   $base_dir, $file - joined to form the key under which read_file() stored
#               this file's column names
#   $doc_obj  - document object that receives the text and metadata
#   (the remaining arguments are not used here)
#
# The raw line becomes the document text.  Each non-empty value is added as
# metadata under the column name at the same position in the header; values
# beyond the last header column are ignored.  When
# $self->{'metadata_value_separator'} is a non-empty string, each value is
# first split on it.
#
# Always returns 1; a line that cannot be parsed is reported on STDERR and
# contributes no metadata.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;

    # Column names are stored per file by read_file(); fall back to an empty
    # list when nothing was recorded under this path.
    my $filename_full_path = FileUtils::filenameConcatenate($base_dir, $file);
    my $fields_ref = $self->{'csv_file_fields'}->{$filename_full_path};
    my @csv_file_fields = defined($fields_ref) ? @{$fields_ref} : ();

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Separator settled by read_file().  If it was not recorded there, use
    # the configured separator directly, provided it is an explicit value
    # rather than 'auto' (which cannot be resolved from a data line here).
    my $separate_char = $self->{'separate_char'};
    if (!defined($separate_char)) {
        my $configured_sep = $self->{'csv_field_separator'};
        if (defined($configured_sep) && $configured_sep !~ m/^auto$/i) {
            $separate_char = $configured_sep;
        }
    }

    # An empty separator means "do not split values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined($md_val_sep) && $md_val_sep eq "");

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if ($csv->parse($csv_line)) {
        my @md_vals = $csv->fields;

        for my $i (0 .. $#md_vals) {
            my $md_val  = $md_vals[$i];
            my $md_name = $csv_file_fields[$i];

            # Only bother with non-empty values that have a column name
            next unless (defined($md_val) && $md_val ne "" && defined($md_name));

            # NOTE(review): the separator is interpolated as a regular
            # expression, so characters such as | act as regex operators
            # unless the configured value escapes them -- confirm this is
            # the intended contract before changing it.
            my @within_md_vals = defined($md_val_sep)
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            foreach my $within_md_val (@within_md_vals) {
                $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
            }
        }
    }
    else {
        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    }

    # Report the record as handled, even when the line could not be parsed
    return 1;
}
	197
	198
	199	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: