Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 21742

Last change on this file since 21742 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 5.0 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29
30	use SplitTextFile;
31	use strict;
32	no strict 'refs'; # allow filehandles to be variables and viceversa
33
34
# Inheritance: CSVPlugin derives from SplitTextFile. The parent list is
# installed inside a BEGIN block so that it is in place at compile time.
BEGIN {
    @CSVPlugin::ISA = qw(SplitTextFile);
}
39
40
# Argument descriptions for this plugin: the expression selecting which files
# are processed, and the expression used to split a file into records. Each
# default comes from the matching get_default_* routine in this package.
my $arguments = do {
    my %process_exp_arg = (
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    );

    my %split_exp_arg = (
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    );

    [ \%process_exp_arg, \%split_exp_arg ];
};
54
55
# Option record describing this plugin; 'args' points at the argument
# descriptions declared in $arguments.
my $options = {};
$options->{'name'}     = "CSVPlugin";
$options->{'desc'}     = "{CSVPlugin.desc}";
$options->{'abstract'} = "no";
$options->{'inherits'} = "yes";
$options->{'explodes'} = "yes";
$options->{'args'}     = $arguments;
62
63
# Default process expression: a case-insensitive pattern matching file names
# that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}
68
69
# Default split expression: records are separated by a line break, with an
# optional carriage return before the newline.
sub get_default_split_exp {
    my $line_break_exp = '\r?\n';
    return $line_break_exp;
}
74
75
# Constructor.
#
# Parameters:
#   $pluginlist      - array reference; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash reference; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by the SplitTextFile constructor, re-blessed into
# $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax rather than "new SplitTextFile(...)": the indirect-object
    # form is ambiguous to the parser inside a package that defines its own new()
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
89
90
# Read a CSV file and separate its header row from its data rows.
#
# Parameters:
#   $filename - path of the CSV file to read
#   $encoding - handed to the multiread reader through set_encoding
#   $language - accepted as part of the interface; not used here
#   $textref  - scalar reference that receives the file text. On return the
#               blank lines and the header row have been removed from it, so
#               only data rows remain.
#
# Side effects: $self->{'csv_file_fields'} is set to a reference to the list
# of field names taken from the header row (spaces removed from each name).
# If the file cannot be opened, a message is printed to the plugin's
# outhandle (STDERR when none is set), the field list is left empty and
# $$textref is left untouched.
#
# Returns the field list reference on success, nothing on open failure.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'} || \*STDERR;

    my @csv_file_fields = ();

    # Read the CSV file content. Three-argument open stops characters in the
    # file name from being interpreted as an open mode, and the result is
    # checked so that an unreadable file is reported instead of being parsed
    # as if it were empty.
    if (!open(FILE, '<', $filename)) {
        print $outhandle "CSVPlugin: could not open $filename for reading: $!\n";
        $self->{'csv_file_fields'} = \@csv_file_fields;
        return;
    }
    my $reader = multiread->new();
    # The reader is given the handle by its package-qualified name
    $reader->set_handle('CSVPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(FILE);

    $$textref = "" unless defined($$textref);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names. $1 is only
    # meaningful after a successful match, so a file with no line break at
    # all (a header row and nothing else) is handled explicitly.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        $csv_file_field_line = $$textref;
        $csv_file_field_line =~ s/\r$//;
        $$textref = "";
    }

    $csv_file_field_line .= ","; # To make the regular expressions simpler
    while ($csv_file_field_line ne "") {
        # A quoted value is tried first, then the normal comma-separated case
        if ($csv_file_field_line =~ s/^\"(.*?)\"\,//
            || $csv_file_field_line =~ s/^(.*?)\,//) {
            my $csv_file_field = $1;
            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }
    }

    $self->{'csv_file_fields'} = \@csv_file_fields;
    return $self->{'csv_file_fields'};
}
133
134
# Turn one CSV data row into document text and metadata.
#
# Parameters:
#   $textref  - scalar reference holding one CSV row; it is not modified
#   $doc_obj  - document object; this routine calls its get_top_section,
#               add_utf8_text and add_utf8_metadata methods
#   $pluginfo, $base_dir, $file, $metadata, $gli
#             - accepted as part of the interface; not used here
#
# The raw row is added as the text of the top section. Each value in the row
# is then paired by position with the field names stored in
# $self->{'csv_file_fields'}; non-empty values that have a field name are
# added as metadata under that name.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Tolerate a missing field list: every value is then simply skipped
    my @csv_file_fields = @{$self->{'csv_file_fields'} || []};

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the values in order, pairing each with the field name at the same
    # position
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        # Metadata values containing commas are quoted, so the quoted form is
        # tried first, then the normal comma-separated case
        if ($csv_line =~ s/^\"(.*?)\"\,// || $csv_line =~ s/^(.*?)\,//) {
            # Copy the capture before calling out: arguments are passed by
            # alias, so handing $1 itself to a method would let any regex
            # inside that method change the value it sees
            my $value = $1;
            my $field = $csv_file_fields[$i];

            # Only bother with non-empty values
            if ($value ne "" && defined($field)) {
                $doc_obj->add_utf8_metadata($section, $field, $value);
            }
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
178
179
180	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: