Context Navigation

source: gsdl/trunk/perllib/plugins/CSVPlugin.pm@ 16698

Last change on this file since 16698 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 5.0 KB

Rev	Line
[11918]	1	###########################################################################
	2	#
[15872]	3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
[11918]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2006 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package CSVPlugin;
[11918]	28
	29
[15872]	30	use SplitTextFile;
[11918]	31	use strict;
[12610]	32	no strict 'refs'; # allow filehandles to be variables and viceversa
[11918]	33
	34
# Inheritance is established at compile time: CSVPlugin derives from
# SplitTextFile.
BEGIN {
    @CSVPlugin::ISA = qw(SplitTextFile);
}
	39
	40
# Argument descriptions contributed by this plugin. Each entry is a hash with
# the argument name, a description key, its type, whether it is required and
# its default value; split_exp additionally carries the 'hiddengli' flag.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },
];
[11918]	54
	55
# Option block describing this plugin; 'args' points at the argument list
# declared in $arguments.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
	62
	63
# Default process_exp: a case-insensitive pattern matching names that end in
# ".csv". Returns the pattern as a string.
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
	68
	69
	70	# This plugin splits the input text by line
	71	sub get_default_split_exp {
	72	return q^\r?\n^;
	73	}
	74
	75
# Constructor.
#
# Appends this class name to $pluginlist and this plugin's argument and option
# descriptions to the "ArgList" and "OptList" entries of $hashArgOptLists,
# then builds the object through the SplitTextFile constructor and blesses the
# result into $class.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    push @{ $hashArgOptLists->{'OptList'} }, $options;

    my $plugin = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless($plugin, $class);
}
	89
	90
# Read a CSV file into $$textref and record its column names.
#
# The file is read through a multiread reader using $encoding. Blank lines are
# removed, then the first remaining line is taken off the text and parsed as a
# comma-separated list of field names (optionally double-quoted, with spaces
# removed from each name).
#
# Parameters:
#   $filename - path of the CSV file to read
#   $encoding - encoding passed to the reader's set_encoding
#   $language - accepted but not used here
#   $textref  - reference to the scalar that receives the text; afterwards it
#               holds only the lines that follow the header line
#
# Side effect: $self->{'csv_file_fields'} becomes a reference to the list of
# parsed field names. It is an empty list when the file cannot be opened or
# when the header line is empty.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Reset first so that a failure below cannot leave the field names of a
    # previously read file in place
    $self->{'csv_file_fields'} = [];

    # Read the CSV file content. The reader is handed the handle by name, so a
    # package filehandle is used; the explicit '<' mode stops characters in
    # $filename from being interpreted as an open mode or a pipe
    if (!open(FILE, '<', $filename)) {
        print STDERR "Error: Could not open CSV file $filename: $!\n";
        return;
    }
    my $reader = multiread->new();
    $reader->set_handle('CSVPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(FILE);

    # Remove any blank lines so the data is split and processed properly.
    # Leading blank lines go first, otherwise an empty line would be taken as
    # the header line
    $$textref =~ s/^\s*\n//;
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names. $1 is only used when
    # the substitution succeeded: after a failed match it would still hold the
    # capture of an earlier successful match
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        # No newline at all: the header is the only line in the text
        $csv_file_field_line = defined($$textref) ? $$textref : "";
        $$textref = "";
    }

    my @csv_file_fields = ();
    if ($csv_file_field_line ne "") {
        $csv_file_field_line .= ","; # To make the regular expressions simpler
        while ($csv_file_field_line ne "") {
            my $csv_file_field;
            # Handle quoted values
            if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
                $csv_file_field = $1;
            }
            # Normal comma-separated case
            elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
                $csv_file_field = $1;
            }
            # The line must be formatted incorrectly
            else {
                print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
                last;
            }

            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
	133
	134
# Turn one CSV record into document text and metadata.
#
# Parameters:
#   $textref  - reference to the text of a single CSV line
#   $doc_obj  - document object that receives the text and the metadata
#   $pluginfo, $base_dir, $file, $metadata, $gli - accepted but not used here
#
# The raw line is added as the text of the document's top section. The line is
# then consumed value by value (double-quoted values may contain commas), and
# each non-empty value is stored as metadata under the field name found at the
# same position in $self->{'csv_file_fields'}. Values beyond the known field
# names are ignored.
#
# Returns 1.
sub process
{
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $top_section = $doc_obj->get_top_section();
    my @field_names = @{$self->{'csv_file_fields'}};

    # The document text is the record exactly as it was read
    $doc_obj->add_utf8_text($top_section, $$textref);

    # Work on a copy with a trailing comma, so every value, including the
    # last, is terminated by a comma
    my $csv_line = $$textref . ",";

    for (my $column = 0; $csv_line ne ""; $column++) {
        my $value;
        if ($csv_line =~ s/^\"(.*?)\"\,//) {
            # Quoted value, which may itself contain commas
            $value = $1;
        }
        elsif ($csv_line =~ s/^(.*?)\,//) {
            # Plain value
            $value = $1;
        }
        else {
            # Neither form matched, so the rest of the line is given up on
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only non-empty values in a named column become metadata
        next if $value eq "";
        next unless defined($field_names[$column]);
        $doc_obj->add_utf8_metadata($top_section, $field_names[$column], $value);
    }

    # Record was processed successfully
    return 1;
}
	178
	179
	180	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: