Context Navigation

source: trunk/gsdl/perllib/plugins/CSVPlug.pm@ 12610

Last change on this file since 12610 was 12610, checked in by mdewsnip, 18 years ago
Essentially a brand-new plugin (the old CSVPlug has been renamed to MetadataCSVPlug). This plugin uses SplitPlug to split CSV files into lines, and creates a new document for each line, with the metadata specified. The first line of the CSV file must contain the metadata element names.
Property svn:keywords set to `Author Date Id Revision`
File size: 4.7 KB

Line
1	###########################################################################
2	#
3	# CSVPlug.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlug;
28
29
30	use SplitPlug;
31	use strict;
32	no strict 'refs'; # allow filehandles to be variables and viceversa
33
34
35	# CSVPlug is a sub-class of SplitPlug.
# Inheritance is set up at compile time: CSVPlug derives from SplitPlug.
BEGIN {
    @CSVPlug::ISA = qw(SplitPlug);
}
39
40
# Argument descriptions contributed by this plugin. The default values are
# taken from the get_default_*_exp subs defined in this package.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitPlug.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },
];
54
55
# Option block describing this plugin; 'args' points at the argument list
# declared above in $arguments.
my $options = {
    'name'     => "CSVPlug",
    'desc'     => "{CSVPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
62
63
64	# This plugin processes files with the suffix ".csv"
# Returns the default process_exp as a regular expression string:
# a case-insensitive match on a trailing ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}
68
69
70	# This plugin splits the input text by line
# Returns the default split_exp as a regular expression string:
# a line feed optionally preceded by a carriage return.
sub get_default_split_exp {
    my $line_break_exp = '\r?\n';
    return $line_break_exp;
}
74
75
# Constructor.
#
# Parameters:
#   $pluginlist      - array reference; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash reference; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option block to
#                      its "OptList" entry
#
# Returns the object built by the SplitPlug constructor, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options) { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit class-method call: the indirect form "new SplitPlug(...)" is
    # ambiguous to the parser inside a package that defines its own new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
89
90
# Read a CSV file and separate its header line from its data lines.
#
# Parameters:
#   $filename - path of the CSV file to read
#   $encoding - encoding name handed to the multiread reader
#   $language - accepted for interface compatibility; not used here
#   $textref  - reference to the scalar that receives the file content
#
# On success, $$textref holds the data lines only (blank lines and the header
# line are removed) and $self->{'csv_file_fields'} references the list of
# metadata element names taken from the header line, with spaces removed.
# If the file cannot be opened, a message is printed to the plugin's output
# handle, $self->{'csv_file_fields'} is left as an empty list and $$textref is
# not modified.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Always leave a defined (possibly empty) field list behind, so that
    # process() never dereferences an undefined value.
    $self->{'csv_file_fields'} = [];

    # Read the CSV file content. Three-argument open keeps characters in
    # $filename from being interpreted as an open mode. The bareword handle is
    # kept because the reader is given the handle by its package-qualified name.
    if (!open(FILE, '<', $filename)) {
        print $outhandle "CSVPlug: could not open $filename for reading: $!\n";
        return;
    }
    my $reader = multiread->new();
    $reader->set_handle('CSVPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(FILE);

    # Nothing was read: treat it as an empty file
    $$textref = "" unless defined($$textref);

    # Remove any blank lines so the data is split and processed properly.
    # Leading blank lines go too, otherwise one would be taken as the header.
    $$textref =~ s/\A\s*\n//;
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        # No line terminator at all: the whole text is the header line and
        # there are no data lines. (Reading $1 here would pick up a stale
        # capture from an earlier match.)
        $csv_file_field_line = $$textref;
        $$textref = "";
    }

    my @csv_file_fields = split(/\,/, $csv_file_field_line);
    foreach my $field_name (@csv_file_fields) {
        # Remove any spaces from the field names
        $field_name =~ s/ //g;
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
118
119
# Build a document from a single CSV data line.
#
# Parameters:
#   $textref   - reference to the text of one CSV line
#   $pluginfo  - not used here
#   $base_dir  - not used here
#   $file      - file name, used only in progress messages
#   $metadata  - not used here
#   $doc_obj   - document object that receives the text and the metadata
#   $gli       - when true, a <Processing ...> marker is written to STDERR
#
# The raw line becomes the text of the document's top section. Each value on
# the line is then added as metadata under the field name at the same position
# in $self->{'csv_file_fields'}; empty values, and values with no matching
# field name, are skipped. Values may be enclosed in double quotes (allowing
# embedded commas), with a literal double quote written as two double quotes.
#
# Always returns 1. A line that cannot be parsed is reported on STDERR and the
# remainder of that line is ignored.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Fall back to an empty list if no header line has been recorded
    my @csv_file_fields = @{ $self->{'csv_file_fields'} || [] };

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='CSVPlug'>\n" if ($gli);
    print $outhandle "CSVPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Add the metadata value at each position under the matching field name
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        my $value;

        # Well-formed quoted value: any quote inside it is doubled
        if ($csv_line =~ s/^\"((?:[^\"\n]|\"\")*)\"\,//) {
            $value = $1;
            $value =~ s/\"\"/\"/g;
        }
        # Loosely quoted value (stray single quotes inside): taken up to the
        # first quote-comma pair and left as written
        elsif ($csv_line =~ s/^\"(.*?)\"\,//) {
            $value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
167
168
169	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: