Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 31888

Last change on this file since 31888 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:keywords set to `Author Date Id Revision`
File size: 5.0 KB

Rev	Line
[11918]	1	###########################################################################
	2	#
[15872]	3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
[11918]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2006 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package CSVPlugin;
[11918]	28
	29
[15872]	30	use SplitTextFile;
[24794]	31	use MetadataRead;
[11918]	32	use strict;
[12610]	33	no strict 'refs'; # allow filehandles to be variables and viceversa
[11918]	34
	35
# Inheritance set-up: CSVPlugin derives from MetadataRead and SplitTextFile,
# listed in that order. BEGIN makes the assignment happen at compile time.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile);
}
	40
	41
# Argument specifications contributed by this plugin. Each record gives the
# argument's name, its description string, its type, whether it is required
# and its default value. The defaults are taken from the two
# get_default_*_exp() subs of this package.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },
];
[11918]	55
	56
# Option record describing this plugin. 'args' points at the argument
# specifications held in $arguments.
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
	63
	64
# Default value for the "process_exp" argument: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
	69
	70
	71	# This plugin splits the input text by line
	72	sub get_default_split_exp {
	73	return q^\r?\n^;
	74	}
	75
	76
# Constructor.
#
# Arguments:
#   $class           - name of the class being instantiated
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed on unchanged to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument specifications are
#                      appended to its "ArgList" array and its option record
#                      to its "OptList" array
#
# Returns the object built by SplitTextFile->new, blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than the indirect-object form
    # ("new SplitTextFile(...)"), whose parsing depends on which subs and
    # packages happen to be known at compile time.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
	90
	91
# Reads a CSV file and strips its header line.
#
# Arguments:
#   $filename, $encoding, $language - passed through to the parent read_file
#   $textref                        - scalar ref holding the file text; it is
#                                     modified in place
#
# After the parent class has read the file, blank lines and the first
# (header) line are removed from $$textref. The header is split into field
# names, with all spaces removed from each name, and the resulting list is
# stored as an array ref in $self->{'csv_file_fields'}.
#
# The value of the final assignment (that array ref) is what the sub returns.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in the file the usual ReadTextFile way.
    # This ensures that $textref is a unicode aware string.
    $self->SUPER::read_file(@_);

    #
    # Now top-up the processing of the text with what this plugin
    # needs
    #

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names. Only trust $1 when
    # the substitution actually matched: after a failed match $1 still holds
    # whatever an earlier successful match captured.
    my $csv_file_field_line = "";
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    elsif (defined($$textref)) {
        # No line break at all: the whole text is the header line and there
        # are no data records left to process.
        $csv_file_field_line = $$textref;
        $$textref = "";
    }

    my @csv_file_fields = ();
    $csv_file_field_line .= ","; # To make the regular expressions simpler
    while ($csv_file_field_line ne "") {
        # Handle quoted values
        if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
            my $csv_file_field = $1;
            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
        # Normal comma-separated case
        elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
            my $csv_file_field = $1;
            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
	134
	135
# Turns one CSV record (one line of the file) into document content.
#
# Arguments:
#   $textref  - scalar ref to the raw CSV line; the referenced string is not
#               modified
#   $doc_obj  - document object; must provide get_top_section(),
#               add_utf8_text() and add_utf8_metadata()
#   $pluginfo, $base_dir, $file, $metadata, $gli - accepted but unused here
#
# The raw line is added as the text of the top section. The line is then
# split into values; the value in column i is stored as metadata under the
# name $self->{'csv_file_fields'}[i]. Empty values, and values in columns
# that have no field name, are skipped.
#
# Values containing commas must be enclosed in double quotes. Inside a quoted
# value a literal double quote is written as two double quotes (RFC 4180) and
# is stored as a single one.
#
# Always returns 1. A malformed line is reported on STDERR and parsing of
# that line stops.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    my @csv_file_fields = @{$self->{'csv_file_fields'}};

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line one value at a time, pairing each value with the field
    # name found at the same position in the header.
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        my $csv_value;

        # Quoted value: everything up to the closing quote, where a doubled
        # quote counts as part of the value rather than as its end.
        if ($csv_line =~ s/^\"((?:[^\"]|\"\")*)\"\,//) {
            $csv_value = $1;
            $csv_value =~ s/\"\"/\"/g; # Unescape doubled quotes
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $csv_value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($csv_value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $csv_value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
	179
	180
	181	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: